# Importing necessary libraries
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
import itertools
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
#Reading the Data from the given file. Modify this as per the placement of the file.
data=pd.read_csv("Data - Parkinsons.csv")
data.head()
#data.tail()
| name | MDVP:Fo(Hz) | MDVP:Fhi(Hz) | MDVP:Flo(Hz) | MDVP:Jitter(%) | MDVP:Jitter(Abs) | MDVP:RAP | MDVP:PPQ | Jitter:DDP | MDVP:Shimmer | ... | Shimmer:DDA | NHR | HNR | status | RPDE | DFA | spread1 | spread2 | D2 | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | phon_R01_S01_1 | 119.992 | 157.302 | 74.997 | 0.00784 | 0.00007 | 0.00370 | 0.00554 | 0.01109 | 0.04374 | ... | 0.06545 | 0.02211 | 21.033 | 1 | 0.414783 | 0.815285 | -4.813031 | 0.266482 | 2.301442 | 0.284654 |
| 1 | phon_R01_S01_2 | 122.400 | 148.650 | 113.819 | 0.00968 | 0.00008 | 0.00465 | 0.00696 | 0.01394 | 0.06134 | ... | 0.09403 | 0.01929 | 19.085 | 1 | 0.458359 | 0.819521 | -4.075192 | 0.335590 | 2.486855 | 0.368674 |
| 2 | phon_R01_S01_3 | 116.682 | 131.111 | 111.555 | 0.01050 | 0.00009 | 0.00544 | 0.00781 | 0.01633 | 0.05233 | ... | 0.08270 | 0.01309 | 20.651 | 1 | 0.429895 | 0.825288 | -4.443179 | 0.311173 | 2.342259 | 0.332634 |
| 3 | phon_R01_S01_4 | 116.676 | 137.871 | 111.366 | 0.00997 | 0.00009 | 0.00502 | 0.00698 | 0.01505 | 0.05492 | ... | 0.08771 | 0.01353 | 20.644 | 1 | 0.434969 | 0.819235 | -4.117501 | 0.334147 | 2.405554 | 0.368975 |
| 4 | phon_R01_S01_5 | 116.014 | 141.781 | 110.655 | 0.01284 | 0.00011 | 0.00655 | 0.00908 | 0.01966 | 0.06425 | ... | 0.10470 | 0.01767 | 19.649 | 1 | 0.417356 | 0.823484 | -3.747787 | 0.234513 | 2.332180 | 0.410335 |
5 rows × 24 columns
data.shape
(195, 24)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 195 entries, 0 to 194 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 195 non-null object 1 MDVP:Fo(Hz) 195 non-null float64 2 MDVP:Fhi(Hz) 195 non-null float64 3 MDVP:Flo(Hz) 195 non-null float64 4 MDVP:Jitter(%) 195 non-null float64 5 MDVP:Jitter(Abs) 195 non-null float64 6 MDVP:RAP 195 non-null float64 7 MDVP:PPQ 195 non-null float64 8 Jitter:DDP 195 non-null float64 9 MDVP:Shimmer 195 non-null float64 10 MDVP:Shimmer(dB) 195 non-null float64 11 Shimmer:APQ3 195 non-null float64 12 Shimmer:APQ5 195 non-null float64 13 MDVP:APQ 195 non-null float64 14 Shimmer:DDA 195 non-null float64 15 NHR 195 non-null float64 16 HNR 195 non-null float64 17 status 195 non-null int64 18 RPDE 195 non-null float64 19 DFA 195 non-null float64 20 spread1 195 non-null float64 21 spread2 195 non-null float64 22 D2 195 non-null float64 23 PPE 195 non-null float64 dtypes: float64(22), int64(1), object(1) memory usage: 36.7+ KB
data.isnull().any()
name False MDVP:Fo(Hz) False MDVP:Fhi(Hz) False MDVP:Flo(Hz) False MDVP:Jitter(%) False MDVP:Jitter(Abs) False MDVP:RAP False MDVP:PPQ False Jitter:DDP False MDVP:Shimmer False MDVP:Shimmer(dB) False Shimmer:APQ3 False Shimmer:APQ5 False MDVP:APQ False Shimmer:DDA False NHR False HNR False status False RPDE False DFA False spread1 False spread2 False D2 False PPE False dtype: bool
Conclusion 1 : There are no Null Data or missing values in any of the columns.
data.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| MDVP:Fo(Hz) | 195.0 | 154.228641 | 41.390065 | 88.333000 | 117.572000 | 148.790000 | 182.769000 | 260.105000 |
| MDVP:Fhi(Hz) | 195.0 | 197.104918 | 91.491548 | 102.145000 | 134.862500 | 175.829000 | 224.205500 | 592.030000 |
| MDVP:Flo(Hz) | 195.0 | 116.324631 | 43.521413 | 65.476000 | 84.291000 | 104.315000 | 140.018500 | 239.170000 |
| MDVP:Jitter(%) | 195.0 | 0.006220 | 0.004848 | 0.001680 | 0.003460 | 0.004940 | 0.007365 | 0.033160 |
| MDVP:Jitter(Abs) | 195.0 | 0.000044 | 0.000035 | 0.000007 | 0.000020 | 0.000030 | 0.000060 | 0.000260 |
| MDVP:RAP | 195.0 | 0.003306 | 0.002968 | 0.000680 | 0.001660 | 0.002500 | 0.003835 | 0.021440 |
| MDVP:PPQ | 195.0 | 0.003446 | 0.002759 | 0.000920 | 0.001860 | 0.002690 | 0.003955 | 0.019580 |
| Jitter:DDP | 195.0 | 0.009920 | 0.008903 | 0.002040 | 0.004985 | 0.007490 | 0.011505 | 0.064330 |
| MDVP:Shimmer | 195.0 | 0.029709 | 0.018857 | 0.009540 | 0.016505 | 0.022970 | 0.037885 | 0.119080 |
| MDVP:Shimmer(dB) | 195.0 | 0.282251 | 0.194877 | 0.085000 | 0.148500 | 0.221000 | 0.350000 | 1.302000 |
| Shimmer:APQ3 | 195.0 | 0.015664 | 0.010153 | 0.004550 | 0.008245 | 0.012790 | 0.020265 | 0.056470 |
| Shimmer:APQ5 | 195.0 | 0.017878 | 0.012024 | 0.005700 | 0.009580 | 0.013470 | 0.022380 | 0.079400 |
| MDVP:APQ | 195.0 | 0.024081 | 0.016947 | 0.007190 | 0.013080 | 0.018260 | 0.029400 | 0.137780 |
| Shimmer:DDA | 195.0 | 0.046993 | 0.030459 | 0.013640 | 0.024735 | 0.038360 | 0.060795 | 0.169420 |
| NHR | 195.0 | 0.024847 | 0.040418 | 0.000650 | 0.005925 | 0.011660 | 0.025640 | 0.314820 |
| HNR | 195.0 | 21.885974 | 4.425764 | 8.441000 | 19.198000 | 22.085000 | 25.075500 | 33.047000 |
| status | 195.0 | 0.753846 | 0.431878 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| RPDE | 195.0 | 0.498536 | 0.103942 | 0.256570 | 0.421306 | 0.495954 | 0.587562 | 0.685151 |
| DFA | 195.0 | 0.718099 | 0.055336 | 0.574282 | 0.674758 | 0.722254 | 0.761881 | 0.825288 |
| spread1 | 195.0 | -5.684397 | 1.090208 | -7.964984 | -6.450096 | -5.720868 | -5.046192 | -2.434031 |
| spread2 | 195.0 | 0.226510 | 0.083406 | 0.006274 | 0.174351 | 0.218885 | 0.279234 | 0.450493 |
| D2 | 195.0 | 2.381826 | 0.382799 | 1.423287 | 2.099125 | 2.361532 | 2.636456 | 3.671155 |
| PPE | 195.0 | 0.206552 | 0.090119 | 0.044539 | 0.137451 | 0.194052 | 0.252980 | 0.527367 |
Multidimensional voice program (MDVP) is studied here.
Some ideas and concepts are grabbed from this paper : https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjSmpOF_4HuAhWd93MBHajlAAsQFjAKegQIDhAC&url=https%3A%2F%2Fcyberleninka.org%2Farticle%2Fn%2F389565.pdf&usg=AOvVaw1WGZGniDRjOtgaJ4T0MEBF
and this paper : https://www.scielo.br/scielo.php?pid=S1516-18462015000401341&script=sci_arttext&tlng=en
and this paper : https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5434464/
MDVP:Fo :
MDVP:Fhi :
MDVP:Flo :
MDVP:Jitter (in %):
MDVP:Jitter (Absolute Jitterness) :
MDVP:RAP :
MDVP:PPQ :
Jitter : DDP (Indicating the difference between jitter cycles) :
MDVP:Shimmer(dB) :
Shimmer:APQ3 (Three point amplitude Perturbation Quotient) :
Shimmer:APQ5 (Five point amplitude Perturbation Quotient) :
MDVP:APQ (Amplitude Perturbation Quotient) :
Shimmer:DDA :
NHR :
HNR :
Status :
RPDE (Relative Period Density Entropy) & DFA :
Rest of the Columns are nonlinear. We can't accurately guess what is going to be happening with them because the fitting might be something else with them. They need to be evaluated before anything.
#Creating a new correlation dataframe for further analysis.
data_correlation=data.corr() #Priting out the Correlation Table
data.corr()
| MDVP:Fo(Hz) | MDVP:Fhi(Hz) | MDVP:Flo(Hz) | MDVP:Jitter(%) | MDVP:Jitter(Abs) | MDVP:RAP | MDVP:PPQ | Jitter:DDP | MDVP:Shimmer | MDVP:Shimmer(dB) | ... | Shimmer:DDA | NHR | HNR | status | RPDE | DFA | spread1 | spread2 | D2 | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MDVP:Fo(Hz) | 1.000000 | 0.400985 | 0.596546 | -0.118003 | -0.382027 | -0.076194 | -0.112165 | -0.076213 | -0.098374 | -0.073742 | ... | -0.094732 | -0.021981 | 0.059144 | -0.383535 | -0.383894 | -0.446013 | -0.413738 | -0.249450 | 0.177980 | -0.372356 |
| MDVP:Fhi(Hz) | 0.400985 | 1.000000 | 0.084951 | 0.102086 | -0.029198 | 0.097177 | 0.091126 | 0.097150 | 0.002281 | 0.043465 | ... | -0.003733 | 0.163766 | -0.024893 | -0.166136 | -0.112404 | -0.343097 | -0.076658 | -0.002954 | 0.176323 | -0.069543 |
| MDVP:Flo(Hz) | 0.596546 | 0.084951 | 1.000000 | -0.139919 | -0.277815 | -0.100519 | -0.095828 | -0.100488 | -0.144543 | -0.119089 | ... | -0.150737 | -0.108670 | 0.210851 | -0.380200 | -0.400143 | -0.050406 | -0.394857 | -0.243829 | -0.100629 | -0.340071 |
| MDVP:Jitter(%) | -0.118003 | 0.102086 | -0.139919 | 1.000000 | 0.935714 | 0.990276 | 0.974256 | 0.990276 | 0.769063 | 0.804289 | ... | 0.746635 | 0.906959 | -0.728165 | 0.278220 | 0.360673 | 0.098572 | 0.693577 | 0.385123 | 0.433434 | 0.721543 |
| MDVP:Jitter(Abs) | -0.382027 | -0.029198 | -0.277815 | 0.935714 | 1.000000 | 0.922911 | 0.897778 | 0.922913 | 0.703322 | 0.716601 | ... | 0.697170 | 0.834972 | -0.656810 | 0.338653 | 0.441839 | 0.175036 | 0.735779 | 0.388543 | 0.310694 | 0.748162 |
| MDVP:RAP | -0.076194 | 0.097177 | -0.100519 | 0.990276 | 0.922911 | 1.000000 | 0.957317 | 1.000000 | 0.759581 | 0.790652 | ... | 0.744919 | 0.919521 | -0.721543 | 0.266668 | 0.342140 | 0.064083 | 0.648328 | 0.324407 | 0.426605 | 0.670999 |
| MDVP:PPQ | -0.112165 | 0.091126 | -0.095828 | 0.974256 | 0.897778 | 0.957317 | 1.000000 | 0.957319 | 0.797826 | 0.839239 | ... | 0.763592 | 0.844604 | -0.731510 | 0.288698 | 0.333274 | 0.196301 | 0.716489 | 0.407605 | 0.412524 | 0.769647 |
| Jitter:DDP | -0.076213 | 0.097150 | -0.100488 | 0.990276 | 0.922913 | 1.000000 | 0.957319 | 1.000000 | 0.759555 | 0.790621 | ... | 0.744901 | 0.919548 | -0.721494 | 0.266646 | 0.342079 | 0.064026 | 0.648328 | 0.324377 | 0.426556 | 0.671005 |
| MDVP:Shimmer | -0.098374 | 0.002281 | -0.144543 | 0.769063 | 0.703322 | 0.759581 | 0.797826 | 0.759555 | 1.000000 | 0.987258 | ... | 0.987626 | 0.722194 | -0.835271 | 0.367430 | 0.447424 | 0.159954 | 0.654734 | 0.452025 | 0.507088 | 0.693771 |
| MDVP:Shimmer(dB) | -0.073742 | 0.043465 | -0.119089 | 0.804289 | 0.716601 | 0.790652 | 0.839239 | 0.790621 | 0.987258 | 1.000000 | ... | 0.963202 | 0.744477 | -0.827805 | 0.350697 | 0.410684 | 0.165157 | 0.652547 | 0.454314 | 0.512233 | 0.695058 |
| Shimmer:APQ3 | -0.094717 | -0.003743 | -0.150747 | 0.746625 | 0.697153 | 0.744912 | 0.763580 | 0.744894 | 0.987625 | 0.963198 | ... | 1.000000 | 0.716207 | -0.827123 | 0.347617 | 0.435242 | 0.151124 | 0.610967 | 0.402243 | 0.467265 | 0.645377 |
| Shimmer:APQ5 | -0.070682 | -0.009997 | -0.101095 | 0.725561 | 0.648961 | 0.709927 | 0.786780 | 0.709907 | 0.982835 | 0.973751 | ... | 0.960072 | 0.658080 | -0.813753 | 0.351148 | 0.399903 | 0.213873 | 0.646809 | 0.457195 | 0.502174 | 0.702456 |
| MDVP:APQ | -0.077774 | 0.004937 | -0.107293 | 0.758255 | 0.648793 | 0.737455 | 0.804139 | 0.737439 | 0.950083 | 0.960977 | ... | 0.896647 | 0.694019 | -0.800407 | 0.364316 | 0.451379 | 0.157276 | 0.673158 | 0.502188 | 0.536869 | 0.721694 |
| Shimmer:DDA | -0.094732 | -0.003733 | -0.150737 | 0.746635 | 0.697170 | 0.744919 | 0.763592 | 0.744901 | 0.987626 | 0.963202 | ... | 1.000000 | 0.716215 | -0.827130 | 0.347608 | 0.435237 | 0.151132 | 0.610971 | 0.402223 | 0.467261 | 0.645389 |
| NHR | -0.021981 | 0.163766 | -0.108670 | 0.906959 | 0.834972 | 0.919521 | 0.844604 | 0.919548 | 0.722194 | 0.744477 | ... | 0.716215 | 1.000000 | -0.714072 | 0.189429 | 0.370890 | -0.131882 | 0.540865 | 0.318099 | 0.470949 | 0.552591 |
| HNR | 0.059144 | -0.024893 | 0.210851 | -0.728165 | -0.656810 | -0.721543 | -0.731510 | -0.721494 | -0.835271 | -0.827805 | ... | -0.827130 | -0.714072 | 1.000000 | -0.361515 | -0.598736 | -0.008665 | -0.673210 | -0.431564 | -0.601401 | -0.692876 |
| status | -0.383535 | -0.166136 | -0.380200 | 0.278220 | 0.338653 | 0.266668 | 0.288698 | 0.266646 | 0.367430 | 0.350697 | ... | 0.347608 | 0.189429 | -0.361515 | 1.000000 | 0.308567 | 0.231739 | 0.564838 | 0.454842 | 0.340232 | 0.531039 |
| RPDE | -0.383894 | -0.112404 | -0.400143 | 0.360673 | 0.441839 | 0.342140 | 0.333274 | 0.342079 | 0.447424 | 0.410684 | ... | 0.435237 | 0.370890 | -0.598736 | 0.308567 | 1.000000 | -0.110950 | 0.591117 | 0.479905 | 0.236931 | 0.545886 |
| DFA | -0.446013 | -0.343097 | -0.050406 | 0.098572 | 0.175036 | 0.064083 | 0.196301 | 0.064026 | 0.159954 | 0.165157 | ... | 0.151132 | -0.131882 | -0.008665 | 0.231739 | -0.110950 | 1.000000 | 0.195668 | 0.166548 | -0.165381 | 0.270445 |
| spread1 | -0.413738 | -0.076658 | -0.394857 | 0.693577 | 0.735779 | 0.648328 | 0.716489 | 0.648328 | 0.654734 | 0.652547 | ... | 0.610971 | 0.540865 | -0.673210 | 0.564838 | 0.591117 | 0.195668 | 1.000000 | 0.652358 | 0.495123 | 0.962435 |
| spread2 | -0.249450 | -0.002954 | -0.243829 | 0.385123 | 0.388543 | 0.324407 | 0.407605 | 0.324377 | 0.452025 | 0.454314 | ... | 0.402223 | 0.318099 | -0.431564 | 0.454842 | 0.479905 | 0.166548 | 0.652358 | 1.000000 | 0.523532 | 0.644711 |
| D2 | 0.177980 | 0.176323 | -0.100629 | 0.433434 | 0.310694 | 0.426605 | 0.412524 | 0.426556 | 0.507088 | 0.512233 | ... | 0.467261 | 0.470949 | -0.601401 | 0.340232 | 0.236931 | -0.165381 | 0.495123 | 0.523532 | 1.000000 | 0.480585 |
| PPE | -0.372356 | -0.069543 | -0.340071 | 0.721543 | 0.748162 | 0.670999 | 0.769647 | 0.671005 | 0.693771 | 0.695058 | ... | 0.645389 | 0.552591 | -0.692876 | 0.531039 | 0.545886 | 0.270445 | 0.962435 | 0.644711 | 0.480585 | 1.000000 |
23 rows × 23 columns
#checking to see if the "name" column is unique, meaning it is only an index, and serves no purpose
#since it is a string object.
data["name"].is_unique
True
#dropping the name column for further analysis.
data_name_dropped=data.drop("name",axis=1)
#data_name_dropped["status"].convert_dtype("int")
data_name_dropped
| MDVP:Fo(Hz) | MDVP:Fhi(Hz) | MDVP:Flo(Hz) | MDVP:Jitter(%) | MDVP:Jitter(Abs) | MDVP:RAP | MDVP:PPQ | Jitter:DDP | MDVP:Shimmer | MDVP:Shimmer(dB) | ... | Shimmer:DDA | NHR | HNR | status | RPDE | DFA | spread1 | spread2 | D2 | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 119.992 | 157.302 | 74.997 | 0.00784 | 0.00007 | 0.00370 | 0.00554 | 0.01109 | 0.04374 | 0.426 | ... | 0.06545 | 0.02211 | 21.033 | 1 | 0.414783 | 0.815285 | -4.813031 | 0.266482 | 2.301442 | 0.284654 |
| 1 | 122.400 | 148.650 | 113.819 | 0.00968 | 0.00008 | 0.00465 | 0.00696 | 0.01394 | 0.06134 | 0.626 | ... | 0.09403 | 0.01929 | 19.085 | 1 | 0.458359 | 0.819521 | -4.075192 | 0.335590 | 2.486855 | 0.368674 |
| 2 | 116.682 | 131.111 | 111.555 | 0.01050 | 0.00009 | 0.00544 | 0.00781 | 0.01633 | 0.05233 | 0.482 | ... | 0.08270 | 0.01309 | 20.651 | 1 | 0.429895 | 0.825288 | -4.443179 | 0.311173 | 2.342259 | 0.332634 |
| 3 | 116.676 | 137.871 | 111.366 | 0.00997 | 0.00009 | 0.00502 | 0.00698 | 0.01505 | 0.05492 | 0.517 | ... | 0.08771 | 0.01353 | 20.644 | 1 | 0.434969 | 0.819235 | -4.117501 | 0.334147 | 2.405554 | 0.368975 |
| 4 | 116.014 | 141.781 | 110.655 | 0.01284 | 0.00011 | 0.00655 | 0.00908 | 0.01966 | 0.06425 | 0.584 | ... | 0.10470 | 0.01767 | 19.649 | 1 | 0.417356 | 0.823484 | -3.747787 | 0.234513 | 2.332180 | 0.410335 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190 | 174.188 | 230.978 | 94.261 | 0.00459 | 0.00003 | 0.00263 | 0.00259 | 0.00790 | 0.04087 | 0.405 | ... | 0.07008 | 0.02764 | 19.517 | 0 | 0.448439 | 0.657899 | -6.538586 | 0.121952 | 2.657476 | 0.133050 |
| 191 | 209.516 | 253.017 | 89.488 | 0.00564 | 0.00003 | 0.00331 | 0.00292 | 0.00994 | 0.02751 | 0.263 | ... | 0.04812 | 0.01810 | 19.147 | 0 | 0.431674 | 0.683244 | -6.195325 | 0.129303 | 2.784312 | 0.168895 |
| 192 | 174.688 | 240.005 | 74.287 | 0.01360 | 0.00008 | 0.00624 | 0.00564 | 0.01873 | 0.02308 | 0.256 | ... | 0.03804 | 0.10715 | 17.883 | 0 | 0.407567 | 0.655683 | -6.787197 | 0.158453 | 2.679772 | 0.131728 |
| 193 | 198.764 | 396.961 | 74.904 | 0.00740 | 0.00004 | 0.00370 | 0.00390 | 0.01109 | 0.02296 | 0.241 | ... | 0.03794 | 0.07223 | 19.020 | 0 | 0.451221 | 0.643956 | -6.744577 | 0.207454 | 2.138608 | 0.123306 |
| 194 | 214.289 | 260.277 | 77.973 | 0.00567 | 0.00003 | 0.00295 | 0.00317 | 0.00885 | 0.01884 | 0.190 | ... | 0.03078 | 0.04398 | 21.209 | 0 | 0.462803 | 0.664357 | -5.724056 | 0.190667 | 2.555477 | 0.148569 |
195 rows × 23 columns
Trying to estimate on the preliminary basis what columns seem to have a strong correlation with the target column viz. status, which is also categorical variable.
#We plot here the general overlook in a heatmap the correlation to map out which things need to be target and to get
#a sense of what is happening in the data.
plt.figure(figsize=(25,12))
plt.title("Heat map for correlation between features")
sns.heatmap(data_correlation,annot=True)
plt.show()
#########################################################################################################
#########################################################################################################
#This function lets you take the pairs of feature columns whose correlation value is given by the user.
def features_to_be_modelled_as_pairs(data_correlation,correlation_value):
columns_list_export=[]
# You can change the value here based on the user Choice and the whole program will result in appropriate ways
columns_to_model=data_correlation.columns
no_of_columns=len(columns_to_model)
#print(columns_to_model)
for column1 in columns_to_model:
for column2 in columns_to_model:
if column1==column2:
pass
else:
if data_correlation[column1][column2]>=correlation_value:
#print(data_correlation[column1][column2])
if [column2,column1] not in columns_list_export:
columns_list_export.append([column1,column2])
#columns_list_export.append([column1,column2])
#To eliminate repeated entries
#for ind in range(len())
return columns_list_export
###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################
#This function will generate an iterable list of columns that can actually be modelled and
#further processes for Pandas.
#The Columns that are exhibiting the threshold of correlation
#are separated into a list for better access
def individual_iter_columns_as_list(columns_list_export):
columns_list=columns_list_export
Model_columns=[]
#print(columns_list_export)
for item1,item2 in columns_list_export:
if item1 not in Model_columns:
Model_columns.append(item1)
if item2 not in Model_columns:
Model_columns.append(item2)
#print(Model_columns)
return Model_columns
##########################################################################
##########################################################################
##########################################################################
##########################################################################
##########################################################################
#This is the function that is for conditioning.
def correlation_complete_function(data_correlation,correlation_value):
checking_list=features_to_be_modelled_as_pairs(data_correlation,correlation_value)
return checking_list,individual_iter_columns_as_list(checking_list)
##########################################################################
#########################################################################
#Make this correlation ratio as you wish.
correlation_threshold=0.9
###############################################################################
# Uncomment the below functions to check the working of the functions above
print("The Following columns (expressed as a pair) are having high correlation amongst themselves :")
temp1=features_to_be_modelled_as_pairs(data_correlation,correlation_threshold)
print("\n",temp1)
print(" \nThe Following list of columns (expressed as a single iterable list) have correlation ratio greater than {} :".format(correlation_threshold))
Model_columns = individual_iter_columns_as_list(features_to_be_modelled_as_pairs(data_correlation,correlation_threshold))
print("\n",Model_columns)
#print(correlation_complete_function(data_correlation,correlation_threshold)[0],"\n\n\n",correlation_complete_function(data_correlation,correlation_threshold)[1])
The Following columns (expressed as a pair) are having high correlation amongst themselves : [['MDVP:Jitter(%)', 'MDVP:Jitter(Abs)'], ['MDVP:Jitter(%)', 'MDVP:RAP'], ['MDVP:Jitter(%)', 'MDVP:PPQ'], ['MDVP:Jitter(%)', 'Jitter:DDP'], ['MDVP:Jitter(%)', 'NHR'], ['MDVP:Jitter(Abs)', 'MDVP:RAP'], ['MDVP:Jitter(Abs)', 'Jitter:DDP'], ['MDVP:RAP', 'MDVP:PPQ'], ['MDVP:RAP', 'Jitter:DDP'], ['MDVP:RAP', 'NHR'], ['MDVP:PPQ', 'Jitter:DDP'], ['Jitter:DDP', 'NHR'], ['MDVP:Shimmer', 'MDVP:Shimmer(dB)'], ['MDVP:Shimmer', 'Shimmer:APQ3'], ['MDVP:Shimmer', 'Shimmer:APQ5'], ['MDVP:Shimmer', 'MDVP:APQ'], ['MDVP:Shimmer', 'Shimmer:DDA'], ['MDVP:Shimmer(dB)', 'Shimmer:APQ3'], ['MDVP:Shimmer(dB)', 'Shimmer:APQ5'], ['MDVP:Shimmer(dB)', 'MDVP:APQ'], ['MDVP:Shimmer(dB)', 'Shimmer:DDA'], ['Shimmer:APQ3', 'Shimmer:APQ5'], ['Shimmer:APQ3', 'Shimmer:DDA'], ['Shimmer:APQ5', 'MDVP:APQ'], ['Shimmer:APQ5', 'Shimmer:DDA'], ['spread1', 'PPE']] The Following list of columns (expressed as a single iterable list) have correlation ratio greater than 0.9 : ['MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'NHR', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'spread1', 'PPE']
#checking correlation for the targeted columns from the above
#change the correlation value in the above cell for proper analysis
Model_columns=correlation_complete_function(data_correlation,correlation_threshold)[1]
plt.figure(figsize=(20,12))
plt.title("Heat map for correlation between the new set of features")
sns.heatmap(data_correlation[Model_columns],annot=True)
plt.show()
Frist Checking the target column to get familiar of how it is given.
#First checking the countplot of the target column so that we get a preliminary idea of it.
sns.countplot(data["status"].replace({1:"Yes",0:"No"}))
print("\n The total number of Patients who have diagnosed with PD are :",data[data["status"]==1].shape[0])
print("\n The total number of Patients who have diagnosed tobe not with PD are :",data[data["status"]==0].shape[0])
The total number of Patients who have diagnosed with PD are : 147 The total number of Patients who have diagnosed tobe not with PD are : 48
#Now checking how the distribution of the other two columns is
plt.figure(figsize=(20,6))
#dealing with all of its characteristics in one row for "PPE" Column
#PPE Column
plt.subplot(1, 2, 1)
plt.title("Distrituion of PPE")
sns.distplot(data["PPE"])
plt.subplot(1, 2, 2)
plt.title("Outliers of 'PPE' Column")
sns.boxplot(data["PPE"])
#plt.subplot(1, 3, 3)
#plt.title("Correlation Plot for PPE")
sns.jointplot(x=data["PPE"],y=data["status"],kind='reg')
plt.figure(figsize=(20,6))
#Spred1 Column
plt.subplot(1, 2, 1)
plt.title("Distrituion of spread1")
sns.distplot(data["spread1"])
plt.subplot(1, 2, 2)
plt.title("Outliers of spread1 Column")
sns.boxplot(data["spread1"])
#plt.subplot(1, 3, 3)
#plt.title("Distrituion of spread1")
sns.jointplot(x=data["spread1"],y=data["status"],kind='reg')
<seaborn.axisgrid.JointGrid at 0x1e9e0fb1fd0>
There seems to be some outliers within the data for "spread1" Column and "PPE" column. And those outliers are exclusively restricted to the Quartile3 i.e. above the maximum value and doesn't seem to be below the lower Quartile value. That is an important observation of skew of the data.
Let us now go into more detail
Let us do some preparatory work for flexibility and tuning so that we can arbitrarily switch to what columns we choose and how we can deal with the features, dropping them as we wish and dealing with their distribution.
Please Note : I have extensively used the functioning programming approach to these. But better practice would be to use classes, which I didn't use. I plan to do it in future projects.
First doing some pairplot to get a sense of the entire data.
#this is a Useless plot that shows nothing that can be properly deciphered.
# We have hue to "status" because it is a column of interest for us.
#color_palette=sns.color_palette("hls")
sns.pairplot(data,hue='status')
<seaborn.axisgrid.PairGrid at 0x1e9e68a3fd0>
Setting up of a few Functions that will help us eventually
#Please Note : Uncomment the final sentences in this cell to actually use
#the customization options. Due to limitations placed on my computer memory
#I choose disply or render things.
##############################################################################
##############################################################################
#Checking for the plot type choice. You can add as many types of plots are you
#want here with proper key and it will work out appropriately.
def plot_type(key): #You can add more options here depending on your choice
if key =="boxplot":
return sns.boxplot
if key=="distplot":
return sns.distplot
##################################################################################
##################################################################################
##################################################################################
##################################################################################
##################################################################################
#This is the function that actually plots different distribution plots separately
# and fixing the canvas situation for proper presentation.
def plotting_map(data_name_dropped,no_of_cols=1,fig_size=(20,6),key="distplot"):
#plt.figure(figsize=fig_size) #The actual size
#colors=["red","green","yellow","blue","orange"]
columns=list(data_name_dropped.columns)
cols=no_of_cols
temp_var=0
col_len=len(columns)
offset=0
count=0
temp_var=int(col_len/cols)+1
colors=sns.color_palette()
if col_len>cols:
rows=temp_var
offset=0
else:
rows=1
for row in range(rows):
plt.figure(figsize=fig_size)
for var in range(cols):
#print(cols)
#print("row :",row+1,"var :",var+1,"offset+var :",var+offset,"temp_var :",temp_var)
#print(columns[var+offset])
#print (row,cols,var,temp_var,col_len,offset)
#test=data_name_dropped[columns[var+offset]]
#print(test)
#print("\n Loop ended \n")
plt.subplot(row+1, cols, var+1)
plt.title(columns[var+offset])
plot_chosen=plot_type(key)
plot_chosen(data_name_dropped[columns[var+offset]],color=colors[var])
count+=1
#print("loop count",count)
if var==cols-1:
#print("offest before :",offset)
offset+=cols
#print("offset after :",offset)
#print("\n")
if count==col_len:
#cols=(rows+1)*cols
#print("cols :",cols)
#print("out of range index : ")
break
#plt.show()
#print(row,cols,var,temp_var,col_len,offset)
################################################################
################################################################
################################################################
################################################################
################################################################
#creating a better canvas by proper conditioning and supply of the data to the
#the above function.
def neat_plot_univariate(dataframe_given,no_of_columns_selected,fig_dim=(18,6),plot_kind="distplot"):
column_ind=0
for index in range(0,len(dataframe_given.columns),no_of_columns_selected) :
#index_2=index
#index_2+=no_of_columns_selected
#plotting_map()
column_ind=column_ind+no_of_columns_selected
#print("\n",index,column_ind,"\n")
plotting_map(dataframe_given.iloc[:,index:column_ind],no_of_cols=no_of_columns_selected,fig_size=fig_dim,key=plot_kind)
#################################################################
#################################################################
#actually plotting the data here. Uncomment and run to see the functioning.
#neat_plot_univariate(data_name_dropped,no_of_columns_selected=3,fig_dim=(18,6),plot_kind="distplot")
#plt.close()
#Please Note : Uncomment the final sentences in this cell to actually use
#the customization options. Due to limitations placed on my computer memory
#I choose not to display or render things.
########################################################
########################################################
#function to choose what kind of plot we want. We can add more and more plots
#with appropriate keys from the library.
def plot_chose_function(key):
if key=="jointplot":
return sns.jointplot
if key=="catplot":
return sns.catplot
if key=="scatterplot":
return sns.scatterplot
############################################################################
############################################################################
############################################################################
################################################################################
################################################################################
# Create some iterable list from a given dataframe for which we want to create
#joint plots that are actually discernible and seen.
#We can also generate an iterable list based on the target column that we would wish
#to supply so that we can go ahead with plotting the appropriately data for further processing.
def iterable_list_for_bivariate_plot(data_name_dropped,chosen_column=False):
data_given=data_name_dropped
columns=list(data_given.columns)
iterable_list=[]
#print(columns)
var_col=len(columns)
loop_count=0
i=len(columns)-1
while i!=0:
for j in range(0,i):
if j==i:
i-=1
#print("F",columns[j])
#print("R",columns[i])
if chosen_column:
if [columns[i],chosen_column] in iterable_list:
break
iterable_list.append([columns[i],chosen_column])
else:
iterable_list.append([columns[j],columns[i]])
#print("\n i :",i,"\n","j :",j)
loop_count+=1
#print("Inside for loop : \n",loop_count)
#loop_count=0
#print("outside for loop")
#print(iterable_list[i],"\n")
i-=1
return iterable_list
###########################################################################
###########################################################################
#############################################################################
############################################################################
############################################################################
#The Actual fuction that plots the data with appropriate options and make some
#options for the plot.
def bivariate_plot(data_given,column_chosen_for_comparative=False,columns_chosen=1,fig_size=(8,8),key="jointplot",kind_chosen='reg'):
if column_chosen_for_comparative:
iterable_list_go=iterable_list_for_bivariate_plot(data_given,chosen_column=column_chosen_for_comparative)
else:
iterable_list_go=iterable_list_for_bivariate_plot(data_given)
#print(iterable_list_go)
iterable_len=len(iterable_list_go)
count=iterable_len
#print(iterable_len)
cols=columns_chosen
temp_var=int(iterable_len/cols)+1
colors=sns.color_palette()
if iterable_len>cols:
rows=temp_var
#offset=0
else:
rows=1
#plt.figure(figsize=fig_size)
#print(temp_var)
for row_index in range(rows):
plt.figure(figsize=fig_size)
if count==0:
break
if count<cols:
cols=count
#print("\nCols :",count)
for col_index in range(cols):
#print("\n row_index ",row_index,"\n col_index :",col_index)
#print("\n",iterable_list_go[col_index][0])
#print("\n",iterable_list_go[col_index][1])
x_name=iterable_list_go[col_index][0]
y_name=iterable_list_go[col_index][1]
#print("\n x_name :",x_name,"\n y_name :",y_name)
#print(data_given[str.format(x_name)])
#plt.subplot(row_index+1,cols+1,col_index+1)
#plt.title(x_name+" Vs "+y_name)
plot_chosen=plot_chose_function(key)
plot_chosen(x=data_given[x_name],y=data_given[y_name],kind=kind_chosen)
#plt.show()
#print("\ncount for inside :",iterable_len-count)
count-=cols
plt.show()
plt.close()
################################################################################
################################################################################
########################################################################################
################################################################################
################################################################################
#creating a more better and neaterplot for the same as above for a clear presentation
def neat_plot_bivariate(dataframe_given,no_of_columns_selected,column_chosen_for_correlation=None,fig_dim=None,plot_kind="jointplot",kind_chosen_user="reg"):
column_ind=0
input_ind=len(dataframe_given.columns)
for index in range(0,input_ind,no_of_columns_selected) :
#index_2=index
#index_2+=no_of_columns_selected
if input_ind>no_of_columns_selected:
column_ind=column_ind+no_of_columns_selected
else :
column_ind=column_ind+input_ind
input_ind=input_ind-no_of_columns_selected
#print("\n",column_ind/no_of_columns_selected,"\n")
#print("\n",index,column_ind,input_ind,"\n")
#print("\n")
#dataframe_given.iloc[:,index:column_ind].info()
bivariate_plot(dataframe_given.iloc[:,index:column_ind],columns_chosen=no_of_columns_selected,column_chosen_for_comparative=column_chosen_for_correlation,fig_size=fig_dim,key=plot_kind,kind_chosen=kind_chosen_user)
#########################################################################################
#########################################################################################
#modify the plot here. I just used the first few columns here. But, we can only take target columns
#with very high correlation to see how they are related to each other i.e. relationship. We do it much later.
#Uncomment the below functions to see the actual working of it. The first 8 columns are selected here.
#iterable_list_for_bivariate_plot(data_name_dropped,chosen_column="status")
#bivariate_plot(data_name_dropped,columns_chosen=3,fig_size=(20,6),key="jointplot",kind_chosen='reg')
#neat_plot_bivariate(data_name_dropped.iloc[:,:8],fig_dim=(20,6),no_of_columns_selected=3,kind_chosen_user='reg',plot_kind='jointplot')
We are going to set a target column here and the correlation threshold for that target column and create a function that will give us the
#Setting here the target columns as supplied the posed question. Here the target column is "status" column.
# And once again setting up the Correlation Threshold here as per the requirement of the project.
#These are the customization options that can be used.
#Data supplied here
target_column="status"
correlation_threshold_for_target=0.5
##########################################################
#########################################################
########################################################
#########################################################
#########################################################
def target_column_correlation(dataframe_supplied, target_column,correlation_threshold_for_target):
data_given=dataframe_supplied
correlation_threshold=correlation_threshold_for_target
data_supplied_corr=dataframe_supplied.corr()
linked_columns=[]
#data_supplied_corr=data_supplied_corr[data_supplied_corr[target_column]>target_column]
#data_given_correlation=
#print(data_given.index)
for index in data_supplied_corr:
#print("\n The index is :",index)
if data_supplied_corr[target_column][index]>correlation_threshold_for_target:
linked_columns.append(index)
#print(data_supplied_corr[target_column][index])
#print(type(data_supplied_corr))
return linked_columns,data_supplied_corr[target_column][linked_columns]
#target_column=target_column
#target_column_correlation (data_name_dropped,target_column,correlation_threshold_for_target)
#####################################################################################
#####################################################################################
#####################################################################################
####################################################################################
######################################################################################
#Plotting those correlated Quantities with our target column
def target_column_corr_plotting(dataframe_supplied,target_column,correlation_threshold_for_target,kind_chosen_user='hex',plot_kind='jointplot'):
linked_columns,data_supplied_corr=target_column_correlation(dataframe_supplied,target_column,correlation_threshold_for_target)
#if target_column in linked_columns:
#linked_columns.remove(target_column)
#print(linked_columns)
dataframe_finished=dataframe_supplied[linked_columns]
#if target_column in dataframe_finished.columns:
#dataframe_finished=dataframe_finished.drop(target_column,inplace=True,axis=1)
#print("The Following is the correlation data for the given threshold for the target column :\n")
#print(data_supplied_corr)
neat_plot_bivariate(dataframe_finished,fig_dim=(20,6),no_of_columns_selected=3,column_chosen_for_correlation=target_column,kind_chosen_user=kind_chosen_user,plot_kind=plot_kind)
##############################################################################3
###############################################################################
#Testing the above functions with the function call below
#target_column_corr_plotting(data_name_dropped,target_column,correlation_threshold_for_target,kind_chosen_user='kde',plot_kind="jointplot")
Now that we have developed highly customizable functions to choose columns/features as we like and for whatever data, in future projects and everywhere, let us now move ahead with setting up a similar stance for Outlier identification and elimination with high customizability for future projects and this project.
Moving to actual Analysis here.
# Observing the same data in a kind of relationship of categorical variable sense
# to observe the patterns of Logistic Regression in the following plots, so that they act
# as a kind of show of what model might best fit. But we will see
######################################################################
#####################################################################
print("The following list of columns have high correlation with the given '{}' column :\n".format(target_column))
print(target_column_correlation(data_name_dropped,target_column,correlation_threshold_for_target)[0])
print("\nPlotting correlation among those columns :\n")
target_column_corr_plotting(data_name_dropped,target_column,correlation_threshold_for_target,kind_chosen_user='hex',plot_kind="jointplot")
The following list of columns have high correlation with the given 'status' column : ['status', 'spread1', 'PPE'] Plotting correlation among those columns :
<Figure size 1440x432 with 0 Axes>
#Extreme Values are eliminated from the Data here using the functions properly.
#####################################################################
####################################################################
###################################################################
##################################################################
###################################################################
#A Function designed such that a dataframe of correlated columns is generated
#such that they can be used for further easy processing to extract index of all
#the data points such that they can be dropped from our original dataframe.
def df_wo_outlier_output (dataframe_given,Quartiles_given=False):
#passing and processing data so that proper processing is done for plotting
df_ready=dataframe_given.copy()
columns=dataframe_given.columns
Quartiles_index=Quartiles_given.T.index
#print(Quartiles_index)
#print(columns)
for column in columns:
if column in Quartiles_index:
pass
else:
df_ready.drop(column,axis=1,inplace=True)
return df_ready
##############################################################################
#############################################################################
#The data frame supplied here is the dataframe that has the "name" column dropped.
# We can directly call here the function to give the columns so that their correlation
# is high with target column and has outliers.
target_column="status"
correlation_threshold_for_target=0.5
columns_given=target_column_correlation(data_name_dropped,target_column,correlation_threshold_for_target)[0]
##############################################################################
##############################################################################
#Testing with the data below
#Quartiles_data_setting(data_name_dropped,columns_given=exclusive_columns)
#df_wo_outlier_output(data_name_dropped,Quartiles_given=Quartiles_data_setting(data_name_dropped,columns_given=exclusive_columns))
#Extreme Values are eliminated from the Data here using the functions properly.
##############################################################################
#############################################################################
########################################################################
#############################################################################
#################################################################################
#A Very handy function to tweak the threshold and settinf or percentile settings for
#higher analysis of all kinds.
def percentile_threshold_tweaking(data_fr_received,Quartiles_passed,percentile_threshold=(25,50,75)):
#data_fr_received=data_received
#print(data_fr_received)
for columns in data_fr_received.columns:
data_fr_received[columns]["25%"]=data_fr_received[columns].quantile(percentile_threshold[0]/100)
Quartiles_passed.rename({'25%':'{}'.format(percentile_threshold[0])+"%"}, inplace = True)
data_fr_received[columns]["75%"]=data_fr_received[columns].quantile(percentile_threshold[2]/100)
Quartiles_passed.rename({'75%':'{}'.format(percentile_threshold[2])+"%"}, inplace = True)
data_fr_received[columns]["50%"]=data_fr_received[columns].quantile(percentile_threshold[1]/100)
Quartiles_passed.rename({'50%':'{}'.format(percentile_threshold[1])+"%"}, inplace = True)
return Quartiles_passed
#####################################################################
####################################################################
###################################################################
##################################################################
###################################################################
#Function for setting up the data such that Quartiles are given out in proper
#format for further processing. This can be our choice here. Can be called from
#anywhere or within other function.
#Note : Though I have used the default describe() method to access percentile, we can alter it
#by the function here but not used.
def Quartiles_data_setting(data_given,columns_given=False,percentile_threshold_given=False):
data_fr=data_given
Quartiles=data_given.describe()
if columns_given:
Quartiles=Quartiles[columns_given]
column_flag=columns_given
else:
column_flag=data_given.columns
if percentile_threshold_given: #User can define their own function on to decide what is the threshold
Quartiles=percentile_threshold_tweaking(data_fr[column_flag],percentile_threshold=percentile_threshold_given,Quartiles_passed=Quartiles)
#print(columns_given)
#print(Quartiles)
#print(Quartiles)
return Quartiles
else:
return Quartiles
#####################################################################
####################################################################
###################################################################
##################################################################
###################################################################
#A Function designed such that we get a list of the indexes and their nature of outlying
#for a chosen set of columns which might have outliers. Designed such that we might take
# a decision whether or not to exclude the outliers, whether or not they make any significant
#effect on the data, and to what extent do they make a difference, on everything. We can also
# decide on what side of the spectrum of outliers we might want to eliminate.
def index_list_for_outliers (dataframe_given,Quartiles_given_by_user=False):
#passing and processing data so that proper processing is done for plotting
df_ready=dataframe_given.copy()
#original_columns=dataframe_given.columns
#print(Quartiles_given_by_user)
Quartiles_given=Quartiles_given_by_user
#if Quartiles_given_by_user.empty:
#Quartiles_given=Quartiles_given_by_user
#else:
#Quartiles_given=Quartiles_data_setting(df_ready,Quartiles_given_by_user,percentile_threshold_given=percentile_threshold)
Quartiles_index=Quartiles_given_by_user.T.index
Quartiles_columns=Quartiles_given_by_user.index
#print(Quartiles_columns)
#print(Quartiles_given_by_user["spread1"]["65%"])
#print(Quartiles_index)
location_index=[]
percentile_threshold_set=[str(Quartiles_columns[4]),str(Quartiles_columns[5]),str(Quartiles_columns[6])]
#print(percentile_threshold_set)
for index1 in Quartiles_index:
temp_list=[]
for loc_ind_value in df_ready[index1].index:
#temp_list=[]
flag_temp=None
if df_ready[index1][loc_ind_value]<Quartiles_given[index1][percentile_threshold_set[0]]:
temp_list.append(loc_ind_value)
fomat=index1+"_Q1"
flag_temp=True
elif df_ready[index1][loc_ind_value]>Quartiles_given[index1][percentile_threshold_set[2]]:
fomat=index1+"_Q3"
flag_temp=True
else:
flag_temp=False
pass
if loc_ind_value not in temp_list:
if flag_temp:
location_index.append({fomat:loc_ind_value})
return location_index
#We can convert this into a dataframe for further wonderful analysis and more higher analysis.
#This generates an array in the format of "{<column>_<Quartile>:<index value of the particular Outlier>"
# we can modify the function to choose a threshold value by supplying percentile_threshold and appropriately
# modfying the Quartiles_given by giving
#More higher level analysis can be done by declaring functions here and that is properly done in
#future projects.
#################################################################################
#################################################################################
#################################################################################
###################################################################################
##################################################################################
#The data frame supplied here is the dataframe that has the "name" column dropped.
# We can directly call here the function to give the columns so that their correlation
# is high with target column and has outliers.
exclusive_columns=["PPE","spread1"]
##############################################################################
##############################################################################
#creating a new data frame
#Testing with the data below
#percentile_threshold_tweaking(data_name_dropped,Quartiles_passed=data_name_dropped.describe(),percentile_threshold=(30,60,80))
#Quartiles_data_setting(data_name_dropped,columns_given=exclusive_columns,percentile_threshold_given=(30,60,80))
#print(index_list_for_outliers(data_name_dropped,Quartiles_given_by_user=Quartiles_data_setting(data_name_dropped,columns_given=exclusive_columns)))
outlier_data_frame=pd.DataFrame(index_list_for_outliers(data_name_dropped,Quartiles_given_by_user=Quartiles_data_setting(data_name_dropped,columns_given=exclusive_columns)))
Let us now analyze how many outliers are actually there and where they are scattered.
outlier_data_frame.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 98 entries, 0 to 97 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PPE_Q3 49 non-null float64 1 spread1_Q3 49 non-null float64 dtypes: float64(2) memory usage: 1.7 KB
outlier_data_frame
| PPE_Q3 | spread1_Q3 | |
|---|---|---|
| 0 | 0.0 | NaN |
| 1 | 1.0 | NaN |
| 2 | 2.0 | NaN |
| 3 | 3.0 | NaN |
| 4 | 4.0 | NaN |
| ... | ... | ... |
| 93 | NaN | 153.0 |
| 94 | NaN | 154.0 |
| 95 | NaN | 155.0 |
| 96 | NaN | 157.0 |
| 97 | NaN | 164.0 |
98 rows × 2 columns
So, there are 98 outliers in total combining both the columns of interest. We can check this with any other column by just modifying the data. The Outliers that are overlapping in both of the columns might be very interesting to look at. We need to have an indexed location of those entries which have outliers in both "PPE" column and "spread1" column. That might reveal some surprising insights.
Please Note : We are not developing a new function here for generalizing this. We will do that later, since it is much simpler this way. Also because it is completely unnecessary.
#The Number of outliers for PPE column that could influence strongly on our prediction.
outlier1_Q3=outlier_data_frame["PPE_Q3"].count()
outlier2_Q3=outlier_data_frame["spread1_Q3"].count()
total_count=data_name_dropped.shape[0]
print("The Number of Outliers on Q3 side of 'PPE' column is :",outlier1_Q3)
# Checking the above details, we have to observe that there are only two columns.
# According to the code you will get different columns for outliers on either side of the one
#standard deviation. This along with the boxplot will show that the outliers are extremely skewed
#into one side of the spectrum.
print("The Number of Outliers on Q3 side of 'spread1' column is :",outlier2_Q3)
print("The Total Number of Outliers to the data combined :",outlier2_Q3+outlier1_Q3)
print("The Total proportion of the outliers (in %) to the original data :",(outlier2_Q3+outlier1_Q3)*100/total_count)
The Number of Outliers on Q3 side of 'PPE' column is : 49 The Number of Outliers on Q3 side of 'spread1' column is : 49 The Total Number of Outliers to the data combined : 98 The Total proportion of the outliers (in %) to the original data : 50.256410256410255
Since these outliers are significant in number, we have to do something about them. There might also be some repeated entries, some just don't try to fit in. We need further analysis and setup to deal with this in detail. So, lets get into this.
We can't simply eliminate them, neither can be we replace them with average values, because if we do so, their weightage and correlation with other columns will be upset and will upset the overall effectiveness of our chosen model. It is acceptable to repalce NaN values with median or mean but not extreme values. Fit Transform is also not an option, except to eliminate them.
Let us now try to eliminate the extreemities that are shared between both the columns i.e. extremities in both PPE column and spread1 column. But before that we will check how they are both distributed with respect to each other.
sns.jointplot(data_name_dropped["PPE"],data_name_dropped["spread1"],kind="reg")
<seaborn.axisgrid.JointGrid at 0x1e9e0f82460>
There seems to be a very good correlation between both the features, with so much overlapping. That means they are very highly correlated. Let us recheck it again.
#Original Dataframe in which we have to measure the correlation between the two interested Quantities
data_name_dropped[["PPE","spread1"]].corr()
| PPE | spread1 | |
|---|---|---|
| PPE | 1.000000 | 0.962435 |
| spread1 | 0.962435 | 1.000000 |
There is a huge correlation between both. So, let us now proceed to check if those share the same index values. If so that means those outliers are a bigger influence than anything else on the actual prediction of the data.
#Here we have generated a function that takes the dataframe generated by
#our earlier efforts to generate a dataframe from the given choice of correlation
#value and a target column.
##################################################################
####################################################################
#####################################################################
####################################################################
####################################################################
#This gives us the index locations as a list of those values which are outliers in both the columns (or any number)
# in our context. This can be used to generate a drop funciton.
def repeated_extremum_values_index(outlier_data_frame):
spread_values=[]
index_overlapping_list=[]
data_given=outlier_data_frame.fillna("") #In case if we didn't drop NaN entries in the table
columns_outlier=outlier_data_frame.columns
for col in columns_outlier:
spread_values.append(outlier_data_frame[outlier_data_frame[col].isna()==False][col])
for series1 in spread_values:
for series2 in spread_values:
#print("Begin loop :",series1.name,series2.name,"\n")
if series1.name!=series2.name:
#print(series1.name,series2.name,"\n")
for values in series1:
if values in series2:
index_overlapping_list.append(values)
#print(values)
#print(values)
else:
break
print("Not Equal\n")
#print("End loop\n")
#print(index_overlapping_list)
# One more loop to solve the actual issues
return [int(i) for i in index_overlapping_list]
#repeated_outliers_values_index=repeated_extremum_values_index(outlier_data_frame)
We should drop the NaN values from the dataframe here onwards to make real use of it.
#Just getting a sense of the dataframe once again.
#outlier_data_frame.head(),outlier_data_frame.tail()
#Making a new dataframe dropping the NaN values. Inplace replacement is preferred.
#But making it to the safeside here.
outlier_data_frame_clean=outlier_data_frame.fillna('')
###########################################################
############################################################
#creating a new dataframe as a copy from the original dataframe for further analysis,
# and dropping the new datapoints and checking if it makes any sense to the data.
data_duplicate=data_name_dropped.copy()
repeated_outliers_values_index=repeated_extremum_values_index(outlier_data_frame)
columns_of_interest=target_column_correlation(data_name_dropped,target_column,correlation_threshold_for_target)[0]
print("Here the outliers are total in Number that are in both columns sharing same index entry is :",len(repeated_outliers_values_index))
print("The outliers as a percentage ratio of the total data is :",len(repeated_outliers_values_index)*100/data_name_dropped.shape[0])
Here the outliers are total in Number that are in both columns sharing same index entry is : 13 The outliers as a percentage ratio of the total data is : 6.666666666666667
So just eliminating 13 values from the dataframe won't make a difference where the total entries are around 195. We have a loss of around just 6% of the data points. So, we are sticking to this dataframe and not anything else. Because any more loss of datapoints is simply not acceptable.
########################################################################################
########################################################################################
########################################################################################
#######################################################################################
#######################################################################################
data_duplicate.drop(repeated_outliers_values_index,inplace=True)
data_duplicate.shape
(182, 23)
The dataframe now generated has just 182 entries with extreme values dropped from both the columns of "PPE" and "spread1". Now let us check the correlation between them and compare it with our previous ones, i.e. without dropping.
#making use of the function defined Earlier. This will help us make use of the flexibility
#really well.
data_after_drop=data_duplicate[columns_of_interest]
data_before_drop=data_name_dropped[columns_of_interest]
data_after_drop.corr() #Correlation mapping after dropping those extreme values
| status | spread1 | PPE | |
|---|---|---|---|
| status | 1.000000 | 0.557635 | 0.525655 |
| spread1 | 0.557635 | 1.000000 | 0.958913 |
| PPE | 0.525655 | 0.958913 | 1.000000 |
data_before_drop.corr() #Correlation mapping before dropping those extreme values
| status | spread1 | PPE | |
|---|---|---|---|
| status | 1.000000 | 0.564838 | 0.531039 |
| spread1 | 0.564838 | 1.000000 | 0.962435 |
| PPE | 0.531039 | 0.962435 | 1.000000 |
data_after_drop.corr()-data_before_drop.corr() # The difference is measured here of how much our outliers made the difference.
| status | spread1 | PPE | |
|---|---|---|---|
| status | 0.000000 | -0.007203 | -0.005384 |
| spread1 | -0.007203 | 0.000000 | -0.003523 |
| PPE | -0.005384 | -0.003523 | 0.000000 |
There seems to be not much difference even in the correlation relation, ever after dropping. It only makes a small difference to the status, and seems to be a little decrease in the correlation between PPE and spread1.
sns.pairplot(data_after_drop)
<seaborn.axisgrid.PairGrid at 0x1e9e1085d30>
One's Personal choice is to not loose too many data points in the hunt for eliminating extreme values. There are no Null values or no missing Entry values. We can now go ahead and drop the entries from the given table.
We might as well write some function that would eliminate outliers irrespective of their proper relationship for a given column. Use them later when required. But it is not being executed here.
We Will start with Reinitializing everything and brining it all together.
#generating a bigger index list that will show the outliers of any or all chosen columns and will drop as requested.
# First we will generated columns of our interest here, and restrict ourselves only to the targeted column and correlated
#columns instead of going for whole dataframe.
exclusive_columns=exclusive_columns #borrowing from the previous analysis example, but can be modified here.
Quartiles_choss=Quartiles_data_setting(data_name_dropped,columns_given=exclusive_columns,percentile_threshold_given=(25,50,75))
outlier_data_frame_new=pd.DataFrame(index_list_for_outliers(data_name_dropped,Quartiles_given_by_user=Quartiles_choss))
outlier_data_frame_new=outlier_data_frame_new.fillna("")
#outlier_data_frame_new.dtypes
Note that if we drop one entry that means we are also loosing appropriate legitimate entry in other columns too. So, that would devastate the model and make data as thin as imaginable.
#Here we will develop two functions
#######################################################################
######################################################################
#This will export the index items as a list so that we can drop the particular items from the data entries.
#But it requires a clean "NaN" items deleted dataframe as input
#While supplying the below function with an outlier dataframe, we have to make sure
#that we supply on those columns from it which we actually want to drop.
def index_list_extreme_values(outlier_data_frame):
#print(type(outlier_data_frame))
export_list=[]
for series1 in outlier_data_frame:
#print(series1)
#print("end of ",series1.Name)
for values in outlier_data_frame[series1]:
if values=="''":
pass
#print("here values:",values)
if values in export_list:
pass
else:
export_list.append(values)
while '' in export_list:
export_list.remove('')
return [int(i) for i in export_list]
#######################################################################
#######################################################################
######################################################################
#######################################################################
#######################################################################
# A Very simple function that eliminates the outliers from all the columns chosen
# and conditioned as a supply given.
def drop_the_items_data(dataframe_given,passed_index_list):
dataframe_given=dataframe_given.copy()
#print(dataframe_given)
#print(dataframe_given.drop(passed_index_list))
for ind_value in passed_index_list:
dataframe_given.drop(ind_value,inplace=True)
return dataframe_given
##########################################################################
#########################################################################
##########################################################################
#######################################################################
#######################################################################
indxxx=index_list_extreme_values(outlier_data_frame_new)
data_frame_clean_outliers=drop_the_items_data(data_name_dropped,indxxx)
Let us now compare the dataframes of both dropped extemum ones and the original ones.
data_frame_clean_outliers # Clean dataframe without outliers in those particular chosen columns
| MDVP:Fo(Hz) | MDVP:Fhi(Hz) | MDVP:Flo(Hz) | MDVP:Jitter(%) | MDVP:Jitter(Abs) | MDVP:RAP | MDVP:PPQ | Jitter:DDP | MDVP:Shimmer | MDVP:Shimmer(dB) | ... | Shimmer:DDA | NHR | HNR | status | RPDE | DFA | spread1 | spread2 | D2 | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 120.267 | 137.244 | 114.820 | 0.00333 | 0.00003 | 0.00155 | 0.00202 | 0.00466 | 0.01608 | 0.140 | ... | 0.02337 | 0.00607 | 24.886 | 1 | 0.596040 | 0.764112 | -5.634322 | 0.257682 | 1.854785 | 0.211756 |
| 7 | 107.332 | 113.840 | 104.315 | 0.00290 | 0.00003 | 0.00144 | 0.00182 | 0.00431 | 0.01567 | 0.134 | ... | 0.02487 | 0.00344 | 26.892 | 1 | 0.637420 | 0.763262 | -6.167603 | 0.183721 | 2.064693 | 0.163755 |
| 8 | 95.730 | 132.068 | 91.754 | 0.00551 | 0.00006 | 0.00293 | 0.00332 | 0.00880 | 0.02093 | 0.191 | ... | 0.03218 | 0.01070 | 21.812 | 1 | 0.615551 | 0.773587 | -5.498678 | 0.327769 | 2.322511 | 0.231571 |
| 10 | 88.333 | 112.240 | 84.072 | 0.00505 | 0.00006 | 0.00254 | 0.00330 | 0.00763 | 0.02143 | 0.197 | ... | 0.03237 | 0.01166 | 21.118 | 1 | 0.611137 | 0.776156 | -5.249770 | 0.391002 | 2.407313 | 0.249740 |
| 12 | 136.926 | 159.866 | 131.276 | 0.00293 | 0.00002 | 0.00118 | 0.00153 | 0.00355 | 0.01259 | 0.112 | ... | 0.01968 | 0.00581 | 25.703 | 1 | 0.460600 | 0.646846 | -6.547148 | 0.152813 | 2.041277 | 0.138512 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190 | 174.188 | 230.978 | 94.261 | 0.00459 | 0.00003 | 0.00263 | 0.00259 | 0.00790 | 0.04087 | 0.405 | ... | 0.07008 | 0.02764 | 19.517 | 0 | 0.448439 | 0.657899 | -6.538586 | 0.121952 | 2.657476 | 0.133050 |
| 191 | 209.516 | 253.017 | 89.488 | 0.00564 | 0.00003 | 0.00331 | 0.00292 | 0.00994 | 0.02751 | 0.263 | ... | 0.04812 | 0.01810 | 19.147 | 0 | 0.431674 | 0.683244 | -6.195325 | 0.129303 | 2.784312 | 0.168895 |
| 192 | 174.688 | 240.005 | 74.287 | 0.01360 | 0.00008 | 0.00624 | 0.00564 | 0.01873 | 0.02308 | 0.256 | ... | 0.03804 | 0.10715 | 17.883 | 0 | 0.407567 | 0.655683 | -6.787197 | 0.158453 | 2.679772 | 0.131728 |
| 193 | 198.764 | 396.961 | 74.904 | 0.00740 | 0.00004 | 0.00370 | 0.00390 | 0.01109 | 0.02296 | 0.241 | ... | 0.03794 | 0.07223 | 19.020 | 0 | 0.451221 | 0.643956 | -6.744577 | 0.207454 | 2.138608 | 0.123306 |
| 194 | 214.289 | 260.277 | 77.973 | 0.00567 | 0.00003 | 0.00295 | 0.00317 | 0.00885 | 0.01884 | 0.190 | ... | 0.03078 | 0.04398 | 21.209 | 0 | 0.462803 | 0.664357 | -5.724056 | 0.190667 | 2.555477 | 0.148569 |
141 rows × 23 columns
data_name_dropped # Original dataframe with outliers included.
| MDVP:Fo(Hz) | MDVP:Fhi(Hz) | MDVP:Flo(Hz) | MDVP:Jitter(%) | MDVP:Jitter(Abs) | MDVP:RAP | MDVP:PPQ | Jitter:DDP | MDVP:Shimmer | MDVP:Shimmer(dB) | ... | Shimmer:DDA | NHR | HNR | status | RPDE | DFA | spread1 | spread2 | D2 | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 119.992 | 157.302 | 74.997 | 0.00784 | 0.00007 | 0.00370 | 0.00554 | 0.01109 | 0.04374 | 0.426 | ... | 0.06545 | 0.02211 | 21.033 | 1 | 0.414783 | 0.815285 | -4.813031 | 0.266482 | 2.301442 | 0.284654 |
| 1 | 122.400 | 148.650 | 113.819 | 0.00968 | 0.00008 | 0.00465 | 0.00696 | 0.01394 | 0.06134 | 0.626 | ... | 0.09403 | 0.01929 | 19.085 | 1 | 0.458359 | 0.819521 | -4.075192 | 0.335590 | 2.486855 | 0.368674 |
| 2 | 116.682 | 131.111 | 111.555 | 0.01050 | 0.00009 | 0.00544 | 0.00781 | 0.01633 | 0.05233 | 0.482 | ... | 0.08270 | 0.01309 | 20.651 | 1 | 0.429895 | 0.825288 | -4.443179 | 0.311173 | 2.342259 | 0.332634 |
| 3 | 116.676 | 137.871 | 111.366 | 0.00997 | 0.00009 | 0.00502 | 0.00698 | 0.01505 | 0.05492 | 0.517 | ... | 0.08771 | 0.01353 | 20.644 | 1 | 0.434969 | 0.819235 | -4.117501 | 0.334147 | 2.405554 | 0.368975 |
| 4 | 116.014 | 141.781 | 110.655 | 0.01284 | 0.00011 | 0.00655 | 0.00908 | 0.01966 | 0.06425 | 0.584 | ... | 0.10470 | 0.01767 | 19.649 | 1 | 0.417356 | 0.823484 | -3.747787 | 0.234513 | 2.332180 | 0.410335 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190 | 174.188 | 230.978 | 94.261 | 0.00459 | 0.00003 | 0.00263 | 0.00259 | 0.00790 | 0.04087 | 0.405 | ... | 0.07008 | 0.02764 | 19.517 | 0 | 0.448439 | 0.657899 | -6.538586 | 0.121952 | 2.657476 | 0.133050 |
| 191 | 209.516 | 253.017 | 89.488 | 0.00564 | 0.00003 | 0.00331 | 0.00292 | 0.00994 | 0.02751 | 0.263 | ... | 0.04812 | 0.01810 | 19.147 | 0 | 0.431674 | 0.683244 | -6.195325 | 0.129303 | 2.784312 | 0.168895 |
| 192 | 174.688 | 240.005 | 74.287 | 0.01360 | 0.00008 | 0.00624 | 0.00564 | 0.01873 | 0.02308 | 0.256 | ... | 0.03804 | 0.10715 | 17.883 | 0 | 0.407567 | 0.655683 | -6.787197 | 0.158453 | 2.679772 | 0.131728 |
| 193 | 198.764 | 396.961 | 74.904 | 0.00740 | 0.00004 | 0.00370 | 0.00390 | 0.01109 | 0.02296 | 0.241 | ... | 0.03794 | 0.07223 | 19.020 | 0 | 0.451221 | 0.643956 | -6.744577 | 0.207454 | 2.138608 | 0.123306 |
| 194 | 214.289 | 260.277 | 77.973 | 0.00567 | 0.00003 | 0.00295 | 0.00317 | 0.00885 | 0.01884 | 0.190 | ... | 0.03078 | 0.04398 | 21.209 | 0 | 0.462803 | 0.664357 | -5.724056 | 0.190667 | 2.555477 | 0.148569 |
195 rows × 23 columns
Difference among the correlation of both the dataframes and comparing them here. But note here that we can always go back to tweak, increase or decrease the columns of interest simply by lowering the correlation ratio threshold and probably adding more columns manually.
# Difference among the correlation of both the dataframes and comparing.
relative_difference_corr = data_frame_clean_outliers.corr()-data_name_dropped.corr()
relative_difference_corr
| MDVP:Fo(Hz) | MDVP:Fhi(Hz) | MDVP:Flo(Hz) | MDVP:Jitter(%) | MDVP:Jitter(Abs) | MDVP:RAP | MDVP:PPQ | Jitter:DDP | MDVP:Shimmer | MDVP:Shimmer(dB) | ... | Shimmer:DDA | NHR | HNR | status | RPDE | DFA | spread1 | spread2 | D2 | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MDVP:Fo(Hz) | 0.000000 | -0.036132 | 0.005436 | -0.096288 | -0.212585 | -0.026256 | -0.112944 | -0.026324 | -0.011928 | -0.045749 | ... | 3.060124e-02 | -0.011633 | -0.023219 | 0.016581 | 0.003158 | -0.035858 | -0.139015 | -0.086248 | -0.014688 | -0.166363 |
| MDVP:Fhi(Hz) | -0.036132 | 0.000000 | 0.011322 | -0.091919 | -0.117932 | -0.083292 | -0.063857 | -0.083309 | -0.087382 | -0.115914 | ... | -7.534298e-02 | -0.042398 | 0.014456 | -0.054810 | 0.059431 | -0.028111 | -0.100150 | -0.100442 | -0.068627 | -0.134763 |
| MDVP:Flo(Hz) | 0.005436 | 0.011322 | 0.000000 | -0.395549 | -0.362437 | -0.347562 | -0.427632 | -0.347528 | -0.158513 | -0.202929 | ... | -1.190126e-01 | -0.300323 | 0.141957 | -0.000125 | -0.010371 | -0.081927 | -0.226680 | -0.049208 | -0.049663 | -0.259574 |
| MDVP:Jitter(%) | -0.096288 | -0.091919 | -0.395549 | 0.000000 | -0.064493 | -0.027685 | -0.018851 | -0.027684 | -0.308886 | -0.316618 | ... | -2.945164e-01 | -0.164316 | 0.113744 | -0.043029 | 0.016999 | -0.040974 | -0.161289 | -0.190082 | -0.146376 | -0.190722 |
| MDVP:Jitter(Abs) | -0.212585 | -0.117932 | -0.362437 | -0.064493 | 0.000000 | -0.125290 | -0.046874 | -0.125277 | -0.291501 | -0.275268 | ... | -3.176166e-01 | -0.251870 | 0.165674 | -0.007654 | 0.082771 | 0.033751 | -0.054316 | -0.055393 | -0.161125 | -0.065408 |
| MDVP:RAP | -0.026256 | -0.083292 | -0.347562 | -0.027685 | -0.125290 | 0.000000 | -0.001097 | -0.000002 | -0.279360 | -0.288639 | ... | -2.665937e-01 | -0.229445 | 0.101078 | -0.011667 | -0.011583 | -0.016367 | -0.153855 | -0.230564 | -0.138974 | -0.174065 |
| MDVP:PPQ | -0.112944 | -0.063857 | -0.427632 | -0.018851 | -0.046874 | -0.001097 | 0.000000 | -0.001111 | -0.293195 | -0.311489 | ... | -2.730414e-01 | -0.136182 | 0.049173 | -0.012639 | 0.097628 | -0.091429 | -0.100146 | -0.213129 | -0.138446 | -0.147899 |
| Jitter:DDP | -0.026324 | -0.083309 | -0.347528 | -0.027684 | -0.125277 | -0.000002 | -0.001111 | 0.000000 | -0.279452 | -0.288725 | ... | -2.666630e-01 | -0.229527 | 0.101054 | -0.011628 | -0.011826 | -0.016254 | -0.153837 | -0.230697 | -0.138983 | -0.174043 |
| MDVP:Shimmer | -0.011928 | -0.087382 | -0.158513 | -0.308886 | -0.291501 | -0.279360 | -0.293195 | -0.279452 | 0.000000 | 0.004577 | ... | 2.997933e-03 | -0.185684 | 0.095153 | -0.035936 | -0.016027 | -0.095194 | -0.237934 | -0.187154 | -0.167433 | -0.280165 |
| MDVP:Shimmer(dB) | -0.045749 | -0.115914 | -0.202929 | -0.316618 | -0.275268 | -0.288639 | -0.311489 | -0.288725 | 0.004577 | 0.000000 | ... | 1.248815e-02 | -0.184766 | 0.093157 | -0.026154 | 0.007367 | -0.114282 | -0.244063 | -0.207401 | -0.177600 | -0.292178 |
| Shimmer:APQ3 | 0.030597 | -0.075338 | -0.119010 | -0.294499 | -0.317602 | -0.266568 | -0.273023 | -0.266638 | 0.002995 | 0.012482 | ... | -2.208585e-08 | -0.194197 | 0.095739 | -0.051023 | -0.038991 | -0.080323 | -0.232193 | -0.164111 | -0.145694 | -0.269128 |
| Shimmer:APQ5 | 0.007066 | -0.054797 | -0.159593 | -0.337460 | -0.315971 | -0.290038 | -0.325473 | -0.290084 | 0.000066 | -0.000763 | ... | 1.247695e-02 | -0.142102 | 0.068074 | -0.037713 | 0.011029 | -0.159132 | -0.256627 | -0.213336 | -0.144679 | -0.312004 |
| MDVP:APQ | -0.143203 | -0.115248 | -0.259183 | -0.298241 | -0.182191 | -0.276959 | -0.288649 | -0.277109 | 0.021820 | 0.012742 | ... | 3.767208e-02 | -0.174202 | 0.070026 | 0.029590 | 0.039046 | -0.084813 | -0.177574 | -0.185630 | -0.195320 | -0.230750 |
| Shimmer:DDA | 0.030601 | -0.075343 | -0.119013 | -0.294516 | -0.317617 | -0.266594 | -0.273041 | -0.266663 | 0.002998 | 0.012488 | ... | 0.000000e+00 | -0.194204 | 0.095727 | -0.051032 | -0.038982 | -0.080353 | -0.232207 | -0.164117 | -0.145690 | -0.269149 |
| NHR | -0.011633 | -0.042398 | -0.300323 | -0.164316 | -0.251870 | -0.229445 | -0.136182 | -0.229527 | -0.185684 | -0.184766 | ... | -1.942035e-01 | 0.000000 | 0.026835 | -0.071680 | 0.003590 | -0.072341 | -0.224447 | -0.132824 | -0.059645 | -0.240280 |
| HNR | -0.023219 | 0.014456 | 0.141957 | 0.113744 | 0.165674 | 0.101078 | 0.049173 | 0.101054 | 0.095153 | 0.093157 | ... | 9.572707e-02 | 0.026835 | 0.000000 | 0.070282 | 0.026398 | 0.143882 | 0.111855 | 0.190009 | 0.128984 | 0.131047 |
| status | 0.016581 | -0.054810 | -0.000125 | -0.043029 | -0.007654 | -0.011667 | -0.012639 | -0.011628 | -0.035936 | -0.026154 | ... | -5.103245e-02 | -0.071680 | 0.070282 | 0.000000 | -0.083634 | -0.009556 | 0.010953 | -0.049965 | -0.036967 | 0.031301 |
| RPDE | 0.003158 | 0.059431 | -0.010371 | 0.016999 | 0.082771 | -0.011583 | 0.097628 | -0.011826 | -0.016027 | 0.007367 | ... | -3.898174e-02 | 0.003590 | 0.026398 | -0.083634 | 0.000000 | -0.008619 | -0.006325 | -0.033750 | -0.172502 | 0.041792 |
| DFA | -0.035858 | -0.028111 | -0.081927 | -0.040974 | 0.033751 | -0.016367 | -0.091429 | -0.016254 | -0.095194 | -0.114282 | ... | -8.035332e-02 | -0.072341 | 0.143882 | -0.009556 | -0.008619 | 0.000000 | -0.023483 | -0.001984 | -0.033995 | -0.096514 |
| spread1 | -0.139015 | -0.100150 | -0.226680 | -0.161289 | -0.054316 | -0.153855 | -0.100146 | -0.153837 | -0.237934 | -0.244063 | ... | -2.322074e-01 | -0.224447 | 0.111855 | 0.010953 | -0.006325 | -0.023483 | 0.000000 | -0.126341 | -0.176921 | 0.016920 |
| spread2 | -0.086248 | -0.100442 | -0.049208 | -0.190082 | -0.055393 | -0.230564 | -0.213129 | -0.230697 | -0.187154 | -0.207401 | ... | -1.641169e-01 | -0.132824 | 0.190009 | -0.049965 | -0.033750 | -0.001984 | -0.126341 | 0.000000 | -0.191048 | -0.127495 |
| D2 | -0.014688 | -0.068627 | -0.049663 | -0.146376 | -0.161125 | -0.138974 | -0.138446 | -0.138983 | -0.167433 | -0.177600 | ... | -1.456902e-01 | -0.059645 | 0.128984 | -0.036967 | -0.172502 | -0.033995 | -0.176921 | -0.191048 | 0.000000 | -0.180878 |
| PPE | -0.166363 | -0.134763 | -0.259574 | -0.190722 | -0.065408 | -0.174065 | -0.147899 | -0.174043 | -0.280165 | -0.292178 | ... | -2.691492e-01 | -0.240280 | 0.131047 | 0.031301 | 0.041792 | -0.096514 | 0.016920 | -0.127495 | -0.180878 | 0.000000 |
23 rows × 23 columns
#Mapping the data again into a heatmap
plt.figure(figsize=(25,12))
plt.title("Heat map for relative correlation between the data points ")
sns.heatmap(relative_difference_corr,annot=True)
plt.show()
Now let us take the analysis to next stage by checking the correlation difference now between the target column and others.
# Taking correlation data for new dataframe without outliers for target column
relative_difference_corr["status"]
MDVP:Fo(Hz) 0.016581 MDVP:Fhi(Hz) -0.054810 MDVP:Flo(Hz) -0.000125 MDVP:Jitter(%) -0.043029 MDVP:Jitter(Abs) -0.007654 MDVP:RAP -0.011667 MDVP:PPQ -0.012639 Jitter:DDP -0.011628 MDVP:Shimmer -0.035936 MDVP:Shimmer(dB) -0.026154 Shimmer:APQ3 -0.051023 Shimmer:APQ5 -0.037713 MDVP:APQ 0.029590 Shimmer:DDA -0.051032 NHR -0.071680 HNR 0.070282 status 0.000000 RPDE -0.083634 DFA -0.009556 spread1 0.010953 spread2 -0.049965 D2 -0.036967 PPE 0.031301 Name: status, dtype: float64
Entries_clean=data_frame_clean_outliers.shape[0]
Entries_old=data_name_dropped.shape[0]
print("\nHere the entries are total in Number that are in either of the columns(not an outlier on any of the two chosen columns) is :",Entries_clean)
print("\nThe total outliers that are in either columns is :",Entries_old-Entries_clean)
print("\nThe outliers as a percentage ratio of the total data size is :",(Entries_old-Entries_clean)*100/data_name_dropped.shape[0])
Here the entries are total in Number that are in either of the columns(not an outlier on any of the two chosen columns) is : 141 The total outliers that are in either columns is : 54 The outliers as a percentage ratio of the total data size is : 27.692307692307693
So, the total outliers pushed by the two columns of interest towards the dataframe constitutes around 27 % of the original data. That is more than a Quarter of the data. So, it is going to affect our model in more unpredictable ways than that can be corrected later by addition of more data or neutralized more normal data.
Conclusion : We can't proceed with the data such that outliers in both columns are eliminated, we can only proceed with model building for the data, which has entries cleaned of entries that are outliers in both the columns of interest.
Of couse we can modify this any time we want.
We are going to bring back the dataframe (data_duplicate) we originally developed and create a copy of it, initialize it for model training.
# Importing the data from the previous models and functions and then
#initializing it to a new name so that we can move ahead
df=data_duplicate.copy()
#A bunch of lists for final comparision of the models in a proper manner, and for debugging process.
comparison_of_models_matrix=[]
classification_report_list=[]
confusion_matrix_list=[]
meta_classifier_info=[]
Splitting the Data in the ratio 70:30
Personal Opinion : The data entries are so few in number that perhaps any model we fit into might not have proper predicitons or proper models. So, I am not sure where this is going to lead.
#Developing the variables for analysis
x=df.drop("status",axis=1)
y=df["status"]
#Importing necessary libraries ony when it is required.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score as area_under_curve
#Getting the training and testing data set.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)
data_set_shape={"x_train":x_train.shape,"y_train":y_train.shape,"x_test":x_test.shape,"y_test":y_test.shape}
data_set_shape
#Converting Pandas Series to Dataframes Anew
#y_train=pd.DataFrame(x_train)
#print(x_train)
#y_test=pd.DataFrame(x_test)
#print(x_test)
#This is the data generated so that it can be used for further anlaysis and modification if the need arises to look
# back upon our test sets. But I have not used it here in this project.
{'x_train': (127, 22), 'y_train': (127,), 'x_test': (55, 22), 'y_test': (55,)}
Note : Before we go any further we might want to consider the option of eliminating and scaling such that data is properly trained in the model. I am not going to do it but generate a function that will do it for us.
#Defining the new function here
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
#choose a scaler from here or import them properly and then use it.
scaler=StandardScaler()
#scaler=MinMaxScaler()
#We can experiment with as many types of scaler as we can and see the
#various kinds of effects they have on the data
###########################################################################
##########################################################################
#############################################################################
##########################################################################
############################################################################
def transform_data(train_data,scaler_chosen=False):
#print(scaler_chosen)
if scaler_chosen==False:
#print("yes")
return train_data
else:
#print("no")
temp=pd.DataFrame(scaler_chosen.fit_transform(train_data),index=train_data.index,columns=train_data.columns)
return temp
#############################################################################
#############################################################################
#############################################################################
#############################################################################
#############################################################################
#Here I have chosen not to fit transform the data. But if you do want to transform
#the data to fit, just import the appropriate scaler, adjust parameters scaler_chosen
# to the scaler and proceed to transform the data and then
#leave it there.
x_train=transform_data(x_train,scaler_chosen=False) #Here select scaler as fit_transform()
x_test=transform_data(x_test,scaler_chosen=False) #Here select the scaler as MinMaxScaler()
#type(x_train)
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# Fit the model on train
LR_model = LogisticRegression(solver="liblinear")
LR_model.fit(x_train, y_train)
#predict on test
predict_LR = LR_model.predict(x_test)
confusion_LR=metrics.confusion_matrix(y_test, predict_LR, labels=[1, 0])
#coef_df
coef_df_LR = pd.DataFrame(LR_model.coef_)
coef_df_LR['intercept'] = LR_model.intercept_
#Testing the Scores of the Model
Accuracy_LR = accuracy_score(y_test, predict_LR)
Recall_LR = recall_score(y_test, predict_LR)
LR_Testscore=LR_model.score(x_test,y_test)
LR_Trainscore=LR_model.score(x_train,y_train)
auc=area_under_curve(y_test,predict_LR)
#print(auc)
comparison_of_models_matrix.append({"Model":"Logistic Regression","Accuracy":Accuracy_LR,"Recall":Recall_LR,"Train_Score":LR_Trainscore,"Test_Score":LR_Testscore,"Area_Under_Curve":auc})
classification_report_list.append({"Logistic_Regression":classification_report(y_test, predict_LR, labels=[1, 0])})
confusion_matrix_list.append(confusion_LR)
print("\nLogistic Regression Model Score for Train Set : ", LR_Trainscore)
print("\nLogistic Regression Model Score for Test Set : ", LR_Testscore)
df_confusion_LR = pd.DataFrame(confusion_LR, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nLogistic Regression Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_LR, annot=True)
#plt.clf()
#s = [['TN','FP'], ['FN', 'TP']]
#for i in range(2):
#for j in range(2):
#plt.text(j,i, str(s[i][j])+" = "+str(confusion_LR[i][j]))
#plt.show()
Logistic Regression Model Score for Train Set : 0.8740157480314961 Logistic Regression Model Score for Test Set : 0.8545454545454545 Logistic Regression Mapping
<AxesSubplot:>
Observation : Model Score doesn't seem to be bad, but it is not above 90%, so it might be okay.
#coef_df_LR #Coefficient Matrix for Logistic Regression
df_confusion_LR #Printing the confusion Matrix
| Predict Yes | Predict No | |
|---|---|---|
| PD | 36 | 2 |
| Non-PD | 6 | 11 |
#We are directly printing the results.
print("Classification Report\n")
print(classification_report(y_test, predict_LR, labels=[1, 0]))
print("\nThe Accuracy Score of Logistic Regression Model is : ",Accuracy_LR)
print("\nThe Recall Score of Logistic Regression Model is : ",Recall_LR)
Classification Report
precision recall f1-score support
1 0.86 0.95 0.90 38
0 0.85 0.65 0.73 17
accuracy 0.85 55
macro avg 0.85 0.80 0.82 55
weighted avg 0.85 0.85 0.85 55
The Accuracy Score of Logistic Regression Model is : 0.8545454545454545
The Recall Score of Logistic Regression Model is : 0.9473684210526315
Now We are going for Naive Bayes
from sklearn.naive_bayes import GaussianNB
NB_model = GaussianNB()
#Fitting the model for now
NB_model.fit(x_train, y_train.ravel())
#diab_train_predict = diab_model.predict(x_train)
#print(diab_model.score(x_test,y_test))
#Confusion Matrix Checking
predict_NB = NB_model.predict(x_test)
confusion_Bayes=metrics.confusion_matrix(y_test,predict_NB, labels=[1, 0])
NBB_Trainscore=NB_model.score(x_train,y_train)
NBB_Testscore=NB_model.score(x_test,y_test)
Accuracy_Bayes = accuracy_score(y_test, predict_NB)
Recall_Bayes = recall_score(y_test, predict_NB)
auc_NB=area_under_curve(y_test,predict_NB)
comparison_of_models_matrix.append({"Model":"Naive Bayes","Accuracy":Accuracy_Bayes,"Recall":Recall_Bayes,"Train_Score":NBB_Trainscore,"Test_Score":NBB_Testscore,"Area_Under_Curve":auc_NB})
classification_report_list.append({"Naive_Bayes":classification_report(y_test, predict_NB, labels=[1, 0])})
confusion_matrix_list.append(confusion_Bayes)
print("\nNaive Bayes Model Train Score :",NBB_Trainscore)
print("\nNaive Bayes Model Test Score :",NBB_Testscore)
df_confusion_Bayes = pd.DataFrame(confusion_Bayes, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nNaive Bayes Model Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_Bayes, annot=True)
Naive Bayes Model Train Score : 0.6850393700787402 Naive Bayes Model Test Score : 0.6363636363636364 Naive Bayes Model Mapping
<AxesSubplot:>
#checking Confusion Matrix
df_confusion_Bayes
| Predict Yes | Predict No | |
|---|---|---|
| PD | 19 | 19 |
| Non-PD | 1 | 16 |
from sklearn.metrics import classification_report
print("Classification Report\n")
print(classification_report(y_test, predict_NB, labels=[1, 0]))
print("\nThe Accuracy Score of Naive Bayes Model is : ",Accuracy_Bayes)
print("\nThe Recall Score of Naive Bayes Model is : ",Recall_Bayes)
Classification Report
precision recall f1-score support
1 0.95 0.50 0.66 38
0 0.46 0.94 0.62 17
accuracy 0.64 55
macro avg 0.70 0.72 0.64 55
weighted avg 0.80 0.64 0.64 55
The Accuracy Score of Naive Bayes Model is : 0.6363636363636364
The Recall Score of Naive Bayes Model is : 0.5
Now Let us go for KNN for training the data.
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance') #Change the Number of neighbours for better modeling
KNN.fit(x_train, y_train)
predict_KNN = KNN.predict(x_test)
confusion_KNN=metrics.confusion_matrix(y_test, predict_KNN, labels=[1, 0])
Accuracy_KNN = accuracy_score(y_test, predict_KNN)
Recall_KNN = recall_score(y_test, predict_KNN)
KNN_Testscore=KNN.score(x_test,y_test)
KNN_Trainscore=KNN.score(x_train,y_train)
auc_KNN=area_under_curve(y_test,predict_KNN)
print("\nKNN Model Train Score :",KNN_Trainscore)
print("\nKNN Model Test Score :",KNN_Testscore)
comparison_of_models_matrix.append({"Model":"KNN","Accuracy":Accuracy_KNN,"Recall":Recall_KNN,"Train_Score":KNN_Trainscore,"Test_Score":KNN_Testscore,"Area_Under_Curve":auc_KNN})
classification_report_list.append({"KNN":classification_report(y_test, predict_KNN, labels=[1, 0])})
confusion_matrix_list.append(confusion_KNN)
df_confusion_KNN = pd.DataFrame(confusion_KNN, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nConfusion Matrix Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_KNN, annot=True)
KNN Model Train Score : 1.0 KNN Model Test Score : 0.8727272727272727 Confusion Matrix Mapping
<AxesSubplot:>
df_confusion_KNN
| Predict Yes | Predict No | |
|---|---|---|
| PD | 36 | 2 |
| Non-PD | 5 | 12 |
print("Classification Report\n")
print(classification_report(y_test, predict_KNN, labels=[1, 0]))
print("\nThe Accuracy Score of K-Nearest Neighbor Model is : ",Accuracy_KNN)
print("\nThe Recall Score of K-Nearest Neighbor Model is : ",Recall_KNN)
Classification Report
precision recall f1-score support
1 0.88 0.95 0.91 38
0 0.86 0.71 0.77 17
accuracy 0.87 55
macro avg 0.87 0.83 0.84 55
weighted avg 0.87 0.87 0.87 55
The Accuracy Score of K-Nearest Neighbor Model is : 0.8727272727272727
The Recall Score of K-Nearest Neighbor Model is : 0.9473684210526315
Let us now jump to SVM model and see how the scores are. We can take a decision for fit transforming the data for fitting the data after all the models have been evaluated.
from sklearn.svm import SVC
support_vector = SVC(C=10000,probability=True)
support_vector.fit(x_train, y_train)
predict_SVM = support_vector.predict(x_test)
confusion_SVM=metrics.confusion_matrix(y_test, predict_SVM, labels=[1, 0])
Accuracy_SVM = accuracy_score(y_test, predict_SVM)
Recall_SVM = recall_score(y_test, predict_SVM)
support_vector_Testscore=support_vector.score(x_test, y_test)
support_vector_Trainscore=support_vector.score(x_train, y_train)
auc_SVM=area_under_curve(y_test,predict_SVM)
print("\nSupport Vector Machine Model Test Score :",support_vector_Testscore)
print("\nSupport Vector Machine Model Train Score :",support_vector_Trainscore)
comparison_of_models_matrix.append({"Model":"Support Vector Machine","Accuracy":Accuracy_SVM,"Recall":Recall_SVM,"Train_Score":support_vector_Trainscore,"Test_Score":support_vector_Testscore,"Area_Under_Curve":auc_SVM})
classification_report_list.append({"SVM":classification_report(y_test, predict_SVM, labels=[1, 0])})
confusion_matrix_list.append(confusion_SVM)
meta_classifier_info.append({"SVM":{"value":support_vector.predict_proba(x_test)[0],"probability":support_vector.predict_proba(x_test)[1]}})
df_confusion_SVM = pd.DataFrame(confusion_SVM, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nConfusion Matrix Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_SVM, annot=True)
Support Vector Machine Model Test Score : 0.7818181818181819 Support Vector Machine Model Train Score : 0.937007874015748 Confusion Matrix Mapping
<AxesSubplot:>
#support_vector.predict_proba(x_test)[:,1]
#pd.DataFrame(meta_classifier_info[0])
#pd.Series(meta_classifier_info[0]["value"])
#meta_classifier_info
df_confusion_SVM
| Predict Yes | Predict No | |
|---|---|---|
| PD | 35 | 3 |
| Non-PD | 9 | 8 |
print("Classification Report\n")
print(classification_report(y_test, predict_SVM, labels=[1, 0]))
#comparison_of_models_matrix.append({"Model":"Support Vector Machine","Accuracy":Accuracy_SVM,"Recall":Recall_SVM,"Train_Score":support_vector_Trainscore,"Test_Score":support_vector_Testscore})
#classification_report_list.append(["Support Vector Machine",classification_report(y_test, y_predict, labels=[1, 0])])
print("\nThe Accuracy Score of Support Vector Machine Model is : ",Accuracy_SVM)
print("\nThe Recall Score of Support Vector Machine Model is : ",Recall_SVM)
Classification Report
precision recall f1-score support
1 0.80 0.92 0.85 38
0 0.73 0.47 0.57 17
accuracy 0.78 55
macro avg 0.76 0.70 0.71 55
weighted avg 0.77 0.78 0.77 55
The Accuracy Score of Support Vector Machine Model is : 0.7818181818181819
The Recall Score of Support Vector Machine Model is : 0.9210526315789473
pd.DataFrame(comparison_of_models_matrix)
| Model | Accuracy | Recall | Train_Score | Test_Score | Area_Under_Curve | |
|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.854545 | 0.947368 | 0.874016 | 0.854545 | 0.797214 |
| 1 | Naive Bayes | 0.636364 | 0.500000 | 0.685039 | 0.636364 | 0.720588 |
| 2 | KNN | 0.872727 | 0.947368 | 1.000000 | 0.872727 | 0.826625 |
| 3 | Support Vector Machine | 0.781818 | 0.921053 | 0.937008 | 0.781818 | 0.695820 |
#Once importing the necessary importing is done.
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix
from os import system
from IPython.display import Image
decision_tree=DecisionTreeClassifier(criterion = 'entropy')
decision_tree.fit(x_train, y_train)
predict_DT = decision_tree.predict(x_test)
confusion_DT=confusion_matrix(y_test, predict_DT, labels=[0, 1])
Accuracy_DT = accuracy_score(y_test, predict_DT)
Recall_DT = recall_score(y_test, predict_DT)
#Works only if "dot" command works on you machine
decision_tree_Testscore=decision_tree.score(x_test, y_test)
decision_tree_Trainscore=decision_tree.score(x_train, y_train)
auc_DT=area_under_curve(y_test,predict_DT)
#comparison_of_models_matrix.remove({"Model":"Decision Tree","Accuracy":Accuracy_DT,"Recall":Recall_DT,"Train_Score":decision_tree_Trainscore,"Test_Score":decision_tree_Testscore})
#classification_report_list.append(["KNN",classification_report(y_test, y_predict, labels=[1, 0])])
comparison_of_models_matrix.append({"Model":"Decision Tree","Accuracy":Accuracy_DT,"Recall":Recall_DT,"Train_Score":decision_tree_Trainscore,
"Test_Score":decision_tree_Testscore,"Area_Under_Curve":auc_DT})
classification_report_list.append({"Decision_Tree":classification_report(y_test, predict_DT, labels=[1, 0])})
confusion_matrix_list.append(confusion_DT)
df_confusion_DT = pd.DataFrame(confusion_DT, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nDecision Tree Model Test Score :",decision_tree_Testscore)
print("\nDecision Tree Model Train Score :",decision_tree_Trainscore)
print("\nConfusion Matrix Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_DT, annot=True)
Decision Tree Model Test Score : 0.6727272727272727 Decision Tree Model Train Score : 1.0 Confusion Matrix Mapping
<AxesSubplot:>
Model seems to be completely overfit with so much high train score. This needs further modification of the data to fit into the picture.
df_confusion_DT
| Predict Yes | Predict No | |
|---|---|---|
| PD | 7 | 10 |
| Non-PD | 8 | 30 |
#Working towards some image processing and display mechanism to see the tree that
#actually has been designed. I haven't modified much from the sample assignments.
train_char_label = ['No', 'Yes']
Credit_Tree_File = open('credit_tree.dot','w')
decision_tree_data = export_graphviz(decision_tree, out_file=Credit_Tree_File, feature_names = list(x_train), class_names = list(train_char_label))
Credit_Tree_File.close()
retCode = system("dot -Tpng credit_tree.dot -o credit_tree.png")
if(retCode>0):
print("system command returning error: "+str(retCode))
else:
display(Image("credit_tree.png"))
If graphviz doesn't work, we can use plot_tree method from sklearn.tree. Make this cell into code
import matplotlib.pyplot as plt from sklearn.tree import plot_tree
feature_train_names = list(x_train) class_names_chosen = ['No', 'Yes'] fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4, 4), dpi=300) plot_tree(decision_tree, feature_names = feature_train_names, class_names=class_names_chosen, filled = True)
fig.savefig('tree.png')
# importance of features in the tree building ( The importance of a feature is computed as the
#(normalized) total reduction of the criterion brought by that feature.
#It is also known as the Gini importance )
decision_tree_importances = pd.DataFrame(decision_tree.feature_importances_, columns = ["Importance"], index = x_train.columns)
decision_tree_importances
| Importance | |
|---|---|
| MDVP:Fo(Hz) | 0.145663 |
| MDVP:Fhi(Hz) | 0.035449 |
| MDVP:Flo(Hz) | 0.000000 |
| MDVP:Jitter(%) | 0.108035 |
| MDVP:Jitter(Abs) | 0.000000 |
| MDVP:RAP | 0.000000 |
| MDVP:PPQ | 0.000000 |
| Jitter:DDP | 0.027055 |
| MDVP:Shimmer | 0.000000 |
| MDVP:Shimmer(dB) | 0.000000 |
| Shimmer:APQ3 | 0.053576 |
| Shimmer:APQ5 | 0.000000 |
| MDVP:APQ | 0.319047 |
| Shimmer:DDA | 0.000000 |
| NHR | 0.085920 |
| HNR | 0.000000 |
| RPDE | 0.000000 |
| DFA | 0.027055 |
| spread1 | 0.000000 |
| spread2 | 0.000000 |
| D2 | 0.122144 |
| PPE | 0.076057 |
Inference :
#Finally generating a Classification report and appending them to our analytic dataframe.
print("Classification Report\n")
print(classification_report(y_test, predict_DT, labels=[1, 0]))
print("\nThe Accuracy Score of Decision Tree Model is : ",Accuracy_DT)
print("\nThe Recall Score of Decision Tree Model is : ",Recall_DT)
Classification Report
precision recall f1-score support
1 0.75 0.79 0.77 38
0 0.47 0.41 0.44 17
accuracy 0.67 55
macro avg 0.61 0.60 0.60 55
weighted avg 0.66 0.67 0.67 55
The Accuracy Score of Decision Tree Model is : 0.6727272727272727
The Recall Score of Decision Tree Model is : 0.7894736842105263
We will try to do some fit transformation now for the rest of the models.
from sklearn.ensemble import BaggingClassifier
# I am not setting a base estimator here because of overfitting
bagging=BaggingClassifier(n_estimators=50,random_state=1)
bagging.fit(x_train, y_train)
predict_bagging = bagging.predict(x_test)
confusion_Bg=confusion_matrix(y_test, predict_bagging, labels=[0, 1])
Accuracy_Bagging = accuracy_score(y_test, predict_bagging)
Recall_Bagging = recall_score(y_test, predict_bagging)
#Works only if "dot" command works on you machine
bagging_Testscore=bagging.score(x_test, y_test)
bagging_Trainscore=bagging.score(x_train, y_train)
auc_bagging=area_under_curve(y_test,predict_bagging)
#comparison_of_models_matrix.remove({"Model":"Decision Tree","Accuracy":Accuracy_DT,"Recall":Recall_DT,"Train_Score":decision_tree_Trainscore,"Test_Score":decision_tree_Testscore})
#classification_report_list.append(["KNN",classification_report(y_test, y_predict, labels=[1, 0])])
comparison_of_models_matrix.append({"Model":"Bagging","Accuracy":Accuracy_Bagging,"Recall":Recall_Bagging,"Train_Score":bagging_Trainscore,
"Test_Score":bagging_Testscore,"Area_Under_Curve":auc_bagging})
classification_report_list.append({"Bagging":classification_report(y_test, predict_bagging, labels=[1, 0])})
confusion_matrix_list.append(confusion_Bg)
df_confusion_Bg = pd.DataFrame(confusion_Bg, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nBagging Tree Model Test Score :",bagging_Testscore)
print("\nBagging Tree Model Train Score :",bagging_Trainscore)
print("\nConfusion Matrix Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_Bg, annot=True)
Bagging Tree Model Test Score : 0.9272727272727272 Bagging Tree Model Train Score : 1.0 Confusion Matrix Mapping
<AxesSubplot:>
df_confusion_Bg
| Predict Yes | Predict No | |
|---|---|---|
| PD | 13 | 4 |
| Non-PD | 0 | 38 |
#Finally generating a Classification report and appending them to our analytic dataframe.
print("Classification Report\n")
print(classification_report(y_test, predict_bagging, labels=[1, 0]))
print("\nThe Accuracy Score of Ensemble Bagging Model is : ",Accuracy_Bagging)
print("\nThe Recall Score of Ensemble Bagging Model is : ",Recall_Bagging)
Classification Report
precision recall f1-score support
1 0.90 1.00 0.95 38
0 1.00 0.76 0.87 17
accuracy 0.93 55
macro avg 0.95 0.88 0.91 55
weighted avg 0.93 0.93 0.92 55
The Accuracy Score of Ensemble Bagging Model is : 0.9272727272727272
The Recall Score of Ensemble Bagging Model is : 1.0
from sklearn.ensemble import AdaBoostClassifier
# I am not setting a base estimator here because of overfitting
adaboost=AdaBoostClassifier(n_estimators=10,random_state=1)
adaboost.fit(x_train, y_train)
predict_adaboost = adaboost.predict(x_test)
confusion_AB=confusion_matrix(y_test, predict_adaboost, labels=[0, 1])
Accuracy_adaboost = accuracy_score(y_test, predict_adaboost)
Recall_adaboost = recall_score(y_test, predict_adaboost)
adaboost_Testscore=adaboost.score(x_test, y_test)
adaboost_Trainscore=adaboost.score(x_train, y_train)
auc_adaboost=area_under_curve(y_test,predict_adaboost)
comparison_of_models_matrix.append({"Model":"AdaBoosting","Accuracy":Accuracy_adaboost,"Recall":Recall_adaboost,"Train_Score":adaboost_Trainscore,"Test_Score":adaboost_Testscore,"Area_Under_Curve":auc_adaboost})
classification_report_list.append({"AdaBoosting":classification_report(y_test, predict_adaboost, labels=[1, 0])})
confusion_matrix_list.append(confusion_AB)
df_confusion_AB = pd.DataFrame(confusion_AB, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nBagging Model Test Score :",adaboost_Testscore)
print("\nBagging Model Train Score :",adaboost_Trainscore)
print("\nConfusion Matrix Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_AB, annot=True)
Bagging Model Test Score : 0.8727272727272727 Bagging Model Train Score : 0.968503937007874 Confusion Matrix Mapping
<AxesSubplot:>
df_confusion_AB
| Predict Yes | Predict No | |
|---|---|---|
| PD | 13 | 4 |
| Non-PD | 3 | 35 |
#Finally generating a Classification report and appending them to our analytic dataframe.
print("Classification Report\n")
print(classification_report(y_test, predict_adaboost, labels=[1, 0]))
print("\nThe Accuracy Score of Ensemble Adaboost Model is : ",Accuracy_adaboost)
print("\nThe Recall Score of Ensemble Adaboost Model is : ",Recall_adaboost)
Classification Report
precision recall f1-score support
1 0.90 0.92 0.91 38
0 0.81 0.76 0.79 17
accuracy 0.87 55
macro avg 0.85 0.84 0.85 55
weighted avg 0.87 0.87 0.87 55
The Accuracy Score of Ensemble Adaboost Model is : 0.8727272727272727
The Recall Score of Ensemble Adaboost Model is : 0.9210526315789473
from sklearn.ensemble import GradientBoostingClassifier
gradientboost = GradientBoostingClassifier(n_estimators = 10,random_state=1)
gradientboost = gradientboost.fit(x_train, y_train)
predict_gradientboost = gradientboost.predict(x_test)
confusion_GB=confusion_matrix(y_test, predict_gradientboost, labels=[0, 1])
Accuracy_gradientboost = accuracy_score(y_test, predict_gradientboost)
Recall_gradientboost = recall_score(y_test, predict_gradientboost)
gradientboost_Testscore=gradientboost.score(x_test, y_test)
gradientboost_Trainscore=gradientboost.score(x_train, y_train)
auc_gradientboost=area_under_curve(y_test,predict_gradientboost)
#comparison_of_models_matrix.remove({"Model":"Decision Tree","Accuracy":Accuracy_DT,"Recall":Recall_DT,"Train_Score":decision_tree_Trainscore,"Test_Score":decision_tree_Testscore})
#classification_report_list.append(["KNN",classification_report(y_test, y_predict, labels=[1, 0])])
comparison_of_models_matrix.append({"Model":"GradientBoost","Accuracy":Accuracy_gradientboost,"Recall":Recall_gradientboost,"Train_Score":gradientboost_Trainscore,"Test_Score":gradientboost_Testscore,"Area_Under_Curve":auc_gradientboost})
classification_report_list.append(["GradientBoost",classification_report(y_test, predict_gradientboost, labels=[1, 0])])
confusion_matrix_list.append(confusion_GB)
df_confusion_GB = pd.DataFrame(confusion_GB, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nGradient Boost Model Test Score :",gradientboost_Testscore)
print("\nGradient Boost Model Train Score :",gradientboost_Trainscore)
print("\nConfusion Matrix Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_GB, annot=True)
Gradient Boost Model Test Score : 0.8727272727272727 Gradient Boost Model Train Score : 0.984251968503937 Confusion Matrix Mapping
<AxesSubplot:>
df_confusion_GB
| Predict Yes | Predict No | |
|---|---|---|
| PD | 13 | 4 |
| Non-PD | 3 | 35 |
#Finally generating a Classification report and appending them to our analytic dataframe.
print("Classification Report\n")
print(classification_report(y_test, predict_gradientboost, labels=[1, 0]))
print("\nThe Accuracy Score of Ensemble Gradient Boost Model is : ",Accuracy_gradientboost)
print("\nThe Recall Score of Ensemble Gradient Boost Model is : ",Recall_gradientboost)
Classification Report
precision recall f1-score support
1 0.90 0.92 0.91 38
0 0.81 0.76 0.79 17
accuracy 0.87 55
macro avg 0.85 0.84 0.85 55
weighted avg 0.87 0.87 0.87 55
The Accuracy Score of Ensemble Gradient Boost Model is : 0.8727272727272727
The Recall Score of Ensemble Gradient Boost Model is : 0.9210526315789473
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators = 50, random_state=1,max_features=12)
random_forest = random_forest.fit(x_train, y_train)
predict_random_forest = random_forest.predict(x_test)
confusion_RF=confusion_matrix(y_test, predict_random_forest, labels=[0, 1])
Accuracy_random_forest = accuracy_score(y_test, predict_random_forest)
Recall_random_forest = recall_score(y_test, predict_random_forest)
random_forest_Testscore=random_forest.score(x_test, y_test)
random_forest_Trainscore=random_forest.score(x_train, y_train)
auc_random_forest=area_under_curve(y_test,predict_random_forest)
#comparison_of_models_matrix.remove({"Model":"Decision Tree","Accuracy":Accuracy_DT,"Recall":Recall_DT,"Train_Score":decision_tree_Trainscore,"Test_Score":decision_tree_Testscore})
#classification_report_list.append(["KNN",classification_report(y_test, y_predict, labels=[1, 0])])
comparison_of_models_matrix.append({"Model":"Random Forest","Accuracy":Accuracy_random_forest,
"Recall":Recall_random_forest,"Train_Score":random_forest_Trainscore,"Test_Score":random_forest_Testscore,"Area_Under_Curve":auc_random_forest})
classification_report_list.append(["RandomForest",classification_report(y_test, predict_random_forest, labels=[1, 0])])
confusion_matrix_list.append(confusion_RF)
df_confusion_RF = pd.DataFrame(confusion_RF, index = [i for i in ["PD","Non-PD"]],
columns = [i for i in ["Predict Yes","Predict No"]])
print("\nRandom Forest Model Test Score :",random_forest_Testscore)
print("\nRandom Forest Model Train Score :",random_forest_Trainscore)
print("\nConfusion Matrix Mapping")
plt.figure(figsize = (7,5))
sns.heatmap(df_confusion_RF, annot=True)
Random Forest Model Test Score : 0.9272727272727272 Random Forest Model Train Score : 1.0 Confusion Matrix Mapping
<AxesSubplot:>
df_confusion_GB
| Predict Yes | Predict No | |
|---|---|---|
| PD | 13 | 4 |
| Non-PD | 3 | 35 |
#Finally generating a Classification report and appending them to our analytic dataframe.
print("Classification Report\n")
print(classification_report(y_test, predict_random_forest, labels=[1, 0]))
print("\nThe Accuracy Score of Random Forest Classifier Model is : ",Accuracy_random_forest)
print("\nThe Recall Score of Random Forest Classifier Model is : ",Recall_random_forest)
Classification Report
precision recall f1-score support
1 0.93 0.97 0.95 38
0 0.93 0.82 0.87 17
accuracy 0.93 55
macro avg 0.93 0.90 0.91 55
weighted avg 0.93 0.93 0.93 55
The Accuracy Score of Random Forest Classifier Model is : 0.9272727272727272
The Recall Score of Random Forest Classifier Model is : 0.9736842105263158
#Some final modifications because of any mis classificaitons happening.
#We can do this all while originally constructing this data frame,
#but that would complicate and make things unnecessary.
#Model_comparison=pd.DataFrame(comparison_of_models_matrix,index=Model)
Model_comparison=pd.DataFrame(comparison_of_models_matrix)
Model_comparison.drop_duplicates(subset="Model",inplace=True)
#Model_comparison.set_index("Model",inplace=True)
#Model_comparison.index
#Model_comparison
# We are reiterating here to make sure that everything is good and safe and the original copy is not modified.
confusion_matrix_taken=confusion_matrix_list
##########################################################################
########################################################################
########################################################################
########################################################################
#########################################################################
#This function takes a list of confusion matrices from all the models and
#gives out a nice dataframe. We can add further functions to this to create
# and add new featues which is given as a comment here. Define the new
# functions that return a pandas series given the confusion matrix, we can
# always create new ways.
def confusion_matrix_df(confusion_matrix_given):
precision=[]
confusion_df_all=np.array(confusion_matrix_given)
columns_df_labels =["TP","FP","FN","TN"]
confusion_df_all.resize(len(confusion_matrix_given),4)
print(confusion_df_all)
confusion_df_all=np.array(confusion_matrix_taken)
confusion_df_all.resize(len(confusion_matrix_taken),4)
for entries in confusion_df_all:
precision.append(entries[0]/(entries[0]+entries[1])) #have to recheck the formula
confusion_df_all=pd.DataFrame(confusion_df_all,columns=columns_df_labels)
confusion_df_all["Precision"]=pd.Series(precision)
return confusion_df_all
#################################################################################
##################################################################################
#################################################################################
#######################################################################################3
##############################################################################3##
#Checking for the info
#confusion_matrix_df(confusion_matrix_taken)
# We are joining the Confusion matrix dataframe with the Model Score
# dataframe to create a complete picture and decide what we need to do.
Model_complete=Model_comparison.join(confusion_matrix_df(confusion_matrix_list))
[[36 2 6 11] [19 19 1 16] [36 2 5 12] [35 3 9 8] [ 7 10 8 30] [13 4 0 38] [13 4 3 35] [13 4 3 35] [14 3 1 37]]
Model_complete
| Model | Accuracy | Recall | Train_Score | Test_Score | Area_Under_Curve | TP | FP | FN | TN | Precision | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.854545 | 0.947368 | 0.874016 | 0.854545 | 0.797214 | 36 | 2 | 6 | 11 | 0.947368 |
| 1 | Naive Bayes | 0.636364 | 0.500000 | 0.685039 | 0.636364 | 0.720588 | 19 | 19 | 1 | 16 | 0.500000 |
| 2 | KNN | 0.872727 | 0.947368 | 1.000000 | 0.872727 | 0.826625 | 36 | 2 | 5 | 12 | 0.947368 |
| 3 | Support Vector Machine | 0.781818 | 0.921053 | 0.937008 | 0.781818 | 0.695820 | 35 | 3 | 9 | 8 | 0.921053 |
| 4 | Decision Tree | 0.672727 | 0.789474 | 1.000000 | 0.672727 | 0.600619 | 7 | 10 | 8 | 30 | 0.411765 |
| 5 | Bagging | 0.927273 | 1.000000 | 1.000000 | 0.927273 | 0.882353 | 13 | 4 | 0 | 38 | 0.764706 |
| 6 | AdaBoosting | 0.872727 | 0.921053 | 0.968504 | 0.872727 | 0.842879 | 13 | 4 | 3 | 35 | 0.764706 |
| 7 | GradientBoost | 0.872727 | 0.921053 | 0.984252 | 0.872727 | 0.842879 | 13 | 4 | 3 | 35 | 0.764706 |
| 8 | Random Forest | 0.927273 | 0.973684 | 1.000000 | 0.927273 | 0.898607 | 14 | 3 | 1 | 37 | 0.823529 |
Note : f1-Score has already been calculated before and is placed inside the classification_report_list. We can access it direclty after some proper cleaning. But that takes away all the fun in programming. So, I am doing it here manually.
#Calculation f1 Score
Model_complete["f1_score"]=2*(Model_complete["Precision"]*Model_complete["Recall"])/(Model_complete["Precision"]+Model_complete["Recall"])
Model_complete
| Model | Accuracy | Recall | Train_Score | Test_Score | Area_Under_Curve | TP | FP | FN | TN | Precision | f1_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.854545 | 0.947368 | 0.874016 | 0.854545 | 0.797214 | 36 | 2 | 6 | 11 | 0.947368 | 0.947368 |
| 1 | Naive Bayes | 0.636364 | 0.500000 | 0.685039 | 0.636364 | 0.720588 | 19 | 19 | 1 | 16 | 0.500000 | 0.500000 |
| 2 | KNN | 0.872727 | 0.947368 | 1.000000 | 0.872727 | 0.826625 | 36 | 2 | 5 | 12 | 0.947368 | 0.947368 |
| 3 | Support Vector Machine | 0.781818 | 0.921053 | 0.937008 | 0.781818 | 0.695820 | 35 | 3 | 9 | 8 | 0.921053 | 0.921053 |
| 4 | Decision Tree | 0.672727 | 0.789474 | 1.000000 | 0.672727 | 0.600619 | 7 | 10 | 8 | 30 | 0.411765 | 0.541237 |
| 5 | Bagging | 0.927273 | 1.000000 | 1.000000 | 0.927273 | 0.882353 | 13 | 4 | 0 | 38 | 0.764706 | 0.866667 |
| 6 | AdaBoosting | 0.872727 | 0.921053 | 0.968504 | 0.872727 | 0.842879 | 13 | 4 | 3 | 35 | 0.764706 | 0.835629 |
| 7 | GradientBoost | 0.872727 | 0.921053 | 0.984252 | 0.872727 | 0.842879 | 13 | 4 | 3 | 35 | 0.764706 | 0.835629 |
| 8 | Random Forest | 0.927273 | 0.973684 | 1.000000 | 0.927273 | 0.898607 | 14 | 3 | 1 | 37 | 0.823529 | 0.892334 |
Comments :
I have deliberately not taken into account that definitely some models are overfitting with perfect Train score which is completley unusual. It might be resolved, if we go for either
Observations :
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from mlxtend.classifier import StackingCVClassifier
from sklearn.svm import NuSVC
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
scaler=StandardScaler()
x, y = make_classification(n_samples = 1000, n_features = 30, n_informative = 5,
n_redundant = 15, n_repeated = 5,
n_clusters_per_class = 2, class_sep = 0.5,
random_state = 1000, shuffle = False)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=100)
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test) #could have used the above function but, its fine.
#Importing the classifiers from the earlier declarations and considerations.
# You can add as many as you want here.
#classifier_models=[LR_model,NB_model,KNN,support_vector,decision_tree,KNN,bagging,adaboost,gradientboost,random_forest]
#MLP=MLPClassifiers()
#classifiers.append(MLPClassifier)
#N_support_vector=NuSVC()
#classifiers.append(N_support_vector)
print(x_train)
classifier=[]
# Initializing Support Vector classifier
classifier.append(SVC(C = 50, degree = 1, gamma = "auto", kernel = "rbf", probability = True))
# Initializing Multi-layer perceptron classifier
classifier.append(MLPClassifier(activation = "relu", alpha = 0.1, hidden_layer_sizes = (10,10,10),
learning_rate = "constant", max_iter = 2000, random_state = 1000))
# Initialing Nu Support Vector classifier
classifier.append(NuSVC(degree = 1, kernel = "rbf", nu = 0.25, probability = True))
# Initializing Random Forest classifier
classifier.append(RandomForestClassifier(n_estimators = 500, criterion = "gini", max_depth = 10,
max_features = "auto", min_samples_leaf = 0.005,
min_samples_split = 0.005, n_jobs = -1, random_state = 1000))
stack_of_classifiers=StackingCVClassifier(classifiers=classifier_models,meta_classifier=SVC)
[[ 0.55037488 -0.20032373 0.27082748 ... -0.62693534 -0.82933286 0.24926192] [ 0.55782292 0.80350043 0.55580832 ... -0.76621498 0.78378332 0.95047561] [ 0.71086068 -1.53852209 0.38493797 ... -0.36997037 -1.09136342 -1.42372694] ... [-1.80235737 0.45539874 0.29053724 ... 0.34399614 1.28251015 0.24389846] [-0.61935276 -2.31728278 1.36695961 ... -0.21809112 -0.36127543 0.91816354] [ 0.69428603 0.54574434 -0.17001 ... 0.96612137 -1.87344526 0.81143874]]
#classifiers as a stack
sclf = StackingCVClassifier(classifiers = classifier,
shuffle = False,
use_probas = True,
cv = 5,
meta_classifier = SVC(probability = True))
classifiers = {"SVC": classifier[0],
"MLP": classifier[1],
"NuSVC": classifier[2],
"RF": classifier[3],
"Stack": sclf}
# Train classifiers
for key in classifiers:
# Get classifier
classifier = classifiers[key]
# Fit classifier
classifier.fit(x_train, y_train)
# Save fitted classifier
classifiers[key] = classifier
# Get results
results = pd.DataFrame()
for key in classifiers:
# Make prediction on test set
y_pred = classifiers[key].predict_proba(x_test)[:,1]
# Save results in pandas dataframe object
results[f"{key}"] = y_pred
# Add the test set to the results object
results["Target"] = y_test
# Probability Distributions Figure
# Set graph style
sns.set(font_scale = 1)
sns.set_style({"axes.facecolor": "1.0", "axes.edgecolor": "0.85", "grid.color": "0.85",
"grid.linestyle": "-", 'axes.labelcolor': '0.4', "xtick.color": "0.4",
'ytick.color': '0.4'})
# Plot
f, ax = plt.subplots(figsize=(13, 4), nrows=1, ncols = 5)
for key, counter in zip(classifiers, range(5)):
# Get predictions
y_pred = results[key]
# Get AUC
auc = metrics.roc_auc_score(y_test, y_pred)
textstr = f"AUC: {auc:.3f}"
# Plot false distribution
false_pred = results[results["Target"] == 0]
sns.distplot(false_pred[key], hist=True, kde=False,
bins=int(25), color = 'red',
hist_kws={'edgecolor':'black'}, ax = ax[counter])
# Plot true distribution
true_pred = results[results["Target"] == 1]
sns.distplot(results[key], hist=True, kde=False,
bins=int(25), color = 'green',
hist_kws={'edgecolor':'black'}, ax = ax[counter])
# These are matplotlib.patch.Patch properties
props = dict(boxstyle='round', facecolor='white', alpha=0.5)
# Place a text box in upper left in axes coords
ax[counter].text(0.05, 0.95, textstr, transform=ax[counter].transAxes, fontsize=14,
verticalalignment = "top", bbox=props)
# Set axis limits and labels
ax[counter].set_title(f"{key} Distribution")
ax[counter].set_xlim(0,1)
ax[counter].set_xlabel("Probability")
# Tight layout
plt.tight_layout()
# Save Figure
plt.savefig("Probability Distribution for each Classifier.png", dpi = 1080)
# Define parameter grid
params = {"meta_classifier__kernel": ["linear", "rbf", "poly"],
"meta_classifier__C": [1, 2],
"meta_classifier__degree": [3, 4, 5],
"meta_classifier__probability": [True]}
# Initialize GridSearchCV
grid = GridSearchCV(estimator = sclf,
param_grid = params,
cv = 5,
scoring = "roc_auc",
verbose = 10,
n_jobs = -1)
# Fit GridSearchCV
grid.fit(x_train, y_train)
# Making prediction on test set
y_pred = grid.predict_proba(x_test)[:,1]
# Getting AUC
auc = metrics.roc_auc_score(y_test, y_pred)
# Print results
print(f"The AUC of the tuned Stacking classifier is {auc:.3f}")
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers. [Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 27.7s [Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 54.0s [Parallel(n_jobs=-1)]: Done 29 tasks | elapsed: 59.0s [Parallel(n_jobs=-1)]: Done 40 tasks | elapsed: 1.4min [Parallel(n_jobs=-1)]: Done 53 tasks | elapsed: 1.9min [Parallel(n_jobs=-1)]: Done 69 out of 90 | elapsed: 2.4min remaining: 43.4s [Parallel(n_jobs=-1)]: Done 79 out of 90 | elapsed: 2.5min remaining: 21.0s [Parallel(n_jobs=-1)]: Done 90 out of 90 | elapsed: 2.8min finished
The AUC of the tuned Stacking classifier is 0.885
# Classifier labels
classifier_labels = ["SVC", "MLP", "NuSVC", "RF"]
# Get all unique combinations of classifier with a set size greater than or equal to 2
combo_classifiers = []
for ii in range(2, len(classifier_labels)+1):
for subset in itertools.combinations(classifier_labels, ii):
combo_classifiers.append(subset)
# Stack, tune, and evaluate stack of classifiers
for combo in combo_classifiers:
# Get labels of classifier to create a stack
labels = list(combo)
# Get classifiers
classifier_combo = []
for ii in range(len(labels)):
label = classifier_labels[ii]
classifier = classifiers[label]
classifier_combo.append(classifier)
# Initializing the StackingCV classifier
sclf = StackingCVClassifier(classifiers = classifier_combo,
shuffle = False,
use_probas = True,
cv = 5,
meta_classifier = SVC(probability = True),
n_jobs = -1)
# Initialize GridSearchCV
grid = GridSearchCV(estimator = sclf,
param_grid = params,
cv = 5,
scoring = "roc_auc",
verbose = 0,
n_jobs = -1)
# Fit GridSearchCV
grid.fit(x_train, y_train)
# Making prediction on test set
y_pred = grid.predict_proba(x_test)[:,1]
# Getting AUC
auc = metrics.roc_auc_score(y_test, y_pred)
# Print results
print(f"AUC of stack {combo}: {auc:.3f}")
AUC of stack ('SVC', 'MLP'): 0.861
AUC of stack ('SVC', 'NuSVC'): 0.862
AUC of stack ('SVC', 'RF'): 0.862
AUC of stack ('MLP', 'NuSVC'): 0.861
AUC of stack ('MLP', 'RF'): 0.864
AUC of stack ('NuSVC', 'RF'): 0.858
AUC of stack ('SVC', 'MLP', 'NuSVC'): 0.868
AUC of stack ('SVC', 'MLP', 'RF'): 0.870
AUC of stack ('SVC', 'NuSVC', 'RF'): 0.868
AUC of stack ('MLP', 'NuSVC', 'RF'): 0.867
AUC of stack ('SVC', 'MLP', 'NuSVC', 'RF'): 0.884